1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-26 01:29:34 +00:00

Compare commits

...

54 Commits

Author SHA1 Message Date
Peter Boyle
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
Peter Boyle
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
Peter Boyle
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
Peter Boyle
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
Peter Boyle
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
Peter Boyle
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
Peter Boyle
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
Peter Boyle
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
Peter Boyle
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
Peter Boyle
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
Peter Boyle
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
Peter Boyle
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
Peter Boyle
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
Peter Boyle
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
Peter Boyle
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
Peter Boyle
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
Peter Boyle
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
Peter Boyle
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
Peter Boyle
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
Peter Boyle
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
Peter Boyle
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
Peter Boyle
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
Peter Boyle
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
Peter Boyle
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
Peter Boyle
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
Peter Boyle
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
Peter Boyle
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
Peter Boyle
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
Peter Boyle
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
Peter Boyle
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
Peter Boyle
12e239dd9f Merge branch 'release/dirac-ITT-2020' 2020-10-13 13:38:29 -04:00
Peter Boyle
af2301afbb Merge pull request #312 from i-kanamori/debug_512
add reordring of random number generators in IO
2020-10-13 11:42:12 -04:00
Peter Boyle
f98856a26f Merge pull request #314 from smangham/issue_readme_precision
Fix for deprecated configure options in documentation (issue #313)
2020-10-13 11:41:38 -04:00
Sam Mangham
d55cc5b380 Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 2020-10-12 12:33:13 +01:00
c2b688abc9 Benchmark_IO: reducing max local volume to 32^4 2020-10-10 16:52:56 +01:00
b0d61b9687 Benchmark_IO cleaner output 2020-10-09 21:46:45 +01:00
5f893bf9af Benchmark_IO procurement sizes 2020-10-09 21:31:59 +01:00
0e17bd6597 I/O benchmark cleanup 2020-10-09 20:29:57 +01:00
22caa158cc multi-pass I/O benchmark, with statistic and robustness summary 2020-10-09 20:29:40 +01:00
b24a504d7c hook to access last parallel I/O performance measurement 2020-10-09 20:28:54 +01:00
Peter Boyle
992ef6e9fc more runtime 2020-10-08 22:19:20 -04:00
Peter Boyle
f32a320bc3 Single prec benchmark in double prec compile 2020-10-08 19:52:08 -04:00
Peter Boyle
5f0fe029d2 Improve meemory benchmarks for GPU (avoid host mem ping pong) 2020-10-08 19:51:28 -04:00
6b1486e89b fixing number of colours defaulting to 4 in most cases 2020-10-08 16:31:24 +01:00
Peter Boyle
3f9c427a3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-07 13:12:57 -04:00
Peter Boyle
d201277652 Expose Nc as a compile time configure option.
Remove precision option
2020-10-07 13:07:00 -04:00
fdda7cf9cf Merge branch 'feature/benchmark-io-update' into develop 2020-10-07 15:57:53 +01:00
e22d30f715 Merge branch 'develop' into feature/benchmark-io-update 2020-10-07 15:56:39 +01:00
Peter Boyle
35a69a5133 SU4 x SU4 2020-10-06 21:48:35 -04:00
97db2b8d20 add reordring of random number generator in IO 2020-10-06 17:25:59 +09:00
166 changed files with 2075 additions and 628 deletions

View File

@@ -9,11 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
@@ -55,7 +50,7 @@ script:
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@@ -28,4 +28,7 @@
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */

View File

@@ -165,9 +165,17 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
//template<class T> using commAllocator = devAllocator<T>;
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);

View File

@@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;

View File

@@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
// if ( WorldRank == 0 ){
if ( 1 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@@ -771,22 +772,13 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
#ifdef GRID_SHM_DISABLE
// Hide the shared memory path between ranks
{
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
}
#endif
SharedMemoryTest();

View File

@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
}
}
{
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1;
@@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
} else {
autoView(rhs_v , rhs, AcceleratorRead);
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{
Coordinate coor;
@@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
}
{
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) {
autoView( rhs_v , rhs, AcceleratorWrite);
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

View File

@@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size);
cshiftVector<vobj> send_buf(buffer_size);
cshiftVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else {
int words = send_buf.size();
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
@@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
@@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
cshiftVector<vobj> send_buf_v(buffer_size);
cshiftVector<vobj> recv_buf_v(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
#endif
NAMESPACE_END(Grid);
#endif

View File

@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
//#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>

View File

@@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
@@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj);
//GRID_DEF_UNOP(adj, UnaryAdj);
//GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
@@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))>
vecEval(0, expr.arg3)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))> ret(expr);
vecEval(0, expr.arg3)))>::type > ret(expr);
return ret;
}
#define EXPRESSION_CLOSURE(function) \

View File

@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
@@ -161,11 +161,12 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
}
coalescedWrite(result_v[ss], B);
});

View File

@@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = adj(lhs_v[ss]);
});
return ret;
};
@@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
return ret;
};
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,3 +1,4 @@
#include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;

View File

@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO {
public:
struct IoPerf
{
uint64_t size{0},time{0};
double mbytesPerSecond{0.};
};
static IoPerf lastPerf;
static int latticeWriteMaxRetry;
/////////////////////////////////////////////////////////////////////////////
@@ -502,12 +509,15 @@ class BinaryIO {
timer.Stop();
}
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
lastPerf.time = timer.useconds();
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
<< lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
@@ -663,10 +673,15 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
thread_for(lidx,lsites,{
thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel_rng.SetState(tmp,lidx);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.SetState(tmp,gidx);
});
timer.Stop();
@@ -723,7 +738,12 @@ class BinaryIO {
std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount);
parallel_rng.GetState(tmp,lidx);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.GetState(tmp,gidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
});
timer.Stop();

View File

@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
static constexpr int Zm = 6;
static constexpr int Tm = 7;
static constexpr int Nc=3;
static constexpr int Nc=Config_Nc;
static constexpr int Ns=4;
static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor

View File

@@ -133,14 +133,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
}
template <class Impl>

View File

@@ -51,7 +51,7 @@ public:
private:
template <class mobj, class robj>
static void baryon_site(const mobj &D1,
static void BaryonSite(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
@@ -61,8 +61,18 @@ public:
const int parity,
const bool * wick_contractions,
robj &result);
template <class mobj, class robj>
static void BaryonSiteMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool * wick_contractions,
robj &result);
public:
static void Wick_Contractions(std::string qi,
static void WickContractions(std::string qi,
std::string qf,
bool* wick_contractions);
static void ContractBaryons(const PropagatorField &q1_left,
@@ -75,8 +85,17 @@ public:
const bool* wick_contractions,
const int parity,
ComplexField &baryon_corr);
static void ContractBaryonsMatrix(const PropagatorField &q1_left,
const PropagatorField &q2_left,
const PropagatorField &q3_left,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
SpinMatrixField &baryon_corr);
template <class mobj, class robj>
static void ContractBaryons_Sliced(const mobj &D1,
static void ContractBaryonsSliced(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
@@ -87,9 +106,20 @@ public:
const int parity,
const int nt,
robj &result);
template <class mobj, class robj>
static void ContractBaryonsSlicedMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
const int nt,
robj &result);
private:
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group1_Site(
static void BaryonGamma3ptGroup1Site(
const mobj &Dq1_ti,
const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec,
@@ -101,7 +131,7 @@ public:
robj &result);
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group2_Site(
static void BaryonGamma3ptGroup2Site(
const mobj2 &Dq1_spec,
const mobj &Dq2_ti,
const mobj2 &Dq3_spec,
@@ -113,7 +143,7 @@ public:
robj &result);
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group3_Site(
static void BaryonGamma3ptGroup3Site(
const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec,
const mobj &Dq3_ti,
@@ -125,7 +155,7 @@ public:
robj &result);
public:
template <class mobj>
static void Baryon_Gamma_3pt(
static void BaryonGamma3pt(
const PropagatorField &q_ti,
const mobj &Dq_spec1,
const mobj &Dq_spec2,
@@ -138,7 +168,7 @@ public:
SpinMatrixField &stn_corr);
private:
template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec,
const mobj &Dd_tf,
const mobj &Ds_ti,
@@ -147,7 +177,7 @@ public:
const Gamma GammaB_nucl,
robj &result);
template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf,
const mobj2 &Du_spec,
const mobj &Dd_tf,
@@ -159,7 +189,7 @@ public:
template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec,
const mobj &Dd_tf,
const mobj &Ds_ti,
@@ -168,7 +198,7 @@ public:
const Gamma GammaB_nucl,
robj &result);
template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf,
const mobj2 &Du_spec,
const mobj &Dd_tf,
@@ -179,7 +209,7 @@ public:
robj &result);
public:
template <class mobj>
static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
static void SigmaToNucleonEye(const PropagatorField &qq_loop,
const mobj &Du_spec,
const PropagatorField &qd_tf,
const PropagatorField &qs_ti,
@@ -189,7 +219,7 @@ public:
const std::string op,
SpinMatrixField &stn_corr);
template <class mobj>
static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
static void SigmaToNucleonNonEye(const PropagatorField &qq_ti,
const PropagatorField &qq_tf,
const mobj &Du_spec,
const PropagatorField &qd_tf,
@@ -217,7 +247,7 @@ const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
//This is the old version
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
void BaryonUtils<FImpl>::BaryonSite(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_i,
@@ -329,12 +359,132 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
}}
}
//New version without parity projection or trace
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_i,
const Gamma GammaB_i,
const Gamma GammaA_f,
const Gamma GammaB_f,
const bool * wick_contraction,
robj &result)
{
auto D1_GAi = D1 * GammaA_i;
auto GAf_D1_GAi = GammaA_f * D1_GAi;
auto GBf_D1_GAi = GammaB_f * D1_GAi;
auto D2_GBi = D2 * GammaB_i;
auto GBf_D2_GBi = GammaB_f * D2_GBi;
auto GAf_D2_GBi = GammaA_f * D2_GBi;
auto GBf_D3 = GammaB_f * D3;
auto GAf_D3 = GammaA_f * D3;
for (int ie_f=0; ie_f < 6 ; ie_f++){
int a_f = epsilon[ie_f][0]; //a
int b_f = epsilon[ie_f][1]; //b
int c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
int a_i = epsilon[ie_i][0]; //a'
int b_i = epsilon[ie_i][1]; //b'
int c_i = epsilon[ie_i][2]; //c'
Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
//This is the \delta_{456}^{123} part
if (wick_contraction[0]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int rho_f=0; rho_f<Ns; rho_f++){
auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()(rho_f,rho_i)() += ee * GAf_D1_GAi_rr_cc
* D2_GBi ()(alpha_f,beta_i)(a_f,a_i)
* GBf_D3 ()(alpha_f,beta_i)(b_f,b_i);
}}
}}
}
//This is the \delta_{456}^{231} part
if (wick_contraction[1]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto GBf_D2_GBi_ab_ba = GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() += ee * D1_GAi_ar_ac
* GBf_D2_GBi_ab_ba
* GAf_D3 ()(rho_f,beta_i)(c_f,b_i);
}}
}}
}
//This is the \delta_{456}^{312} part
if (wick_contraction[2]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D3_ab_ab = D3 ()(alpha_f,beta_i)(a_f,b_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() += ee * GBf_D1_GAi_ar_bc
* GAf_D2_GBi ()(rho_f,beta_i)(c_f,a_i)
* D3_ab_ab;
}}
}}
}
//This is the \delta_{456}^{132} part
if (wick_contraction[3]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int rho_f=0; rho_f<Ns; rho_f++){
auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()(rho_f,rho_i)() -= ee * GAf_D1_GAi_rr_cc
* GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i)
* D3 ()(alpha_f,beta_i)(a_f,b_i);
}}
}}
}
//This is the \delta_{456}^{321} part
if (wick_contraction[4]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D2_GBi_ab_aa = D2_GBi()(alpha_f,beta_i)(a_f,a_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() -= ee * GBf_D1_GAi_ar_bc
* D2_GBi_ab_aa
* GAf_D3 ()(rho_f,beta_i)(c_f,b_i);
}}
}}
}
//This is the \delta_{456}^{213} part
if (wick_contraction[5]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto GBf_D3_ab_bb = GBf_D3()(alpha_f,beta_i)(b_f,b_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() -= ee * D1_GAi_ar_ac
* GAf_D2_GBi ()(rho_f,beta_i)(c_f,a_i)
* GBf_D3_ab_bb;
}}
}}
}
}}
}
/* Computes which wick contractions should be performed for a *
* baryon 2pt function given the initial and finals state quark *
* flavours. *
* The array wick_contractions must be of length 6 */
template<class FImpl>
void BaryonUtils<FImpl>::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) {
void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, bool* wick_contractions) {
const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
for (int ie=0; ie < 6 ; ie++) {
wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
@@ -365,11 +515,6 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
GridBase *grid = q1_left.Grid();
@@ -397,13 +542,62 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
auto D2 = v2[ss];
auto D3 = v3[ss];
vobj result=Zero();
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
vbaryon_corr[ss] = result;
} );//end loop over lattice sites
t += usecond();
std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
}
template<class FImpl>
void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left,
const PropagatorField &q2_left,
const PropagatorField &q3_left,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
SpinMatrixField &baryon_corr)
{
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
GridBase *grid = q1_left.Grid();
autoView(vbaryon_corr, baryon_corr,CpuWrite);
autoView( v1 , q1_left, CpuRead);
autoView( v2 , q2_left, CpuRead);
autoView( v3 , q3_left, CpuRead);
// Real bytes =0.;
// bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
// for (int ie=0; ie < 6 ; ie++){
// if(ie==0 or ie==3){
// bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
// }
// else{
// bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
// }
// }
// Real t=0.;
// t =-usecond();
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto D1 = v1[ss];
auto D2 = v2[ss];
auto D3 = v3[ss];
sobj result=Zero();
BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result);
vbaryon_corr[ss] = result;
} );//end loop over lattice sites
// t += usecond();
// std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
}
@@ -414,7 +608,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
* Wick_Contractions function above */
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
void BaryonUtils<FImpl>::ContractBaryonsSliced(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
@@ -430,15 +624,32 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
for (int t=0; t<nt; t++) {
baryon_site(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
BaryonSite(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
}
}
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
const int nt,
robj &result)
{
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
for (int t=0; t<nt; t++) {
BaryonSiteMatrix(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result[t]);
}
}
@@ -454,7 +665,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
void BaryonUtils<FImpl>::BaryonGamma3ptGroup1Site(
const mobj &Dq1_ti,
const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec,
@@ -546,7 +757,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
void BaryonUtils<FImpl>::BaryonGamma3ptGroup2Site(
const mobj2 &Dq1_spec,
const mobj &Dq2_ti,
const mobj2 &Dq3_spec,
@@ -636,7 +847,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
void BaryonUtils<FImpl>::BaryonGamma3ptGroup3Site(
const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec,
const mobj &Dq3_ti,
@@ -728,7 +939,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
* https://aportelli.github.io/Hadrons-doc/#/mcontraction */
template<class FImpl>
template <class mobj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
void BaryonUtils<FImpl>::BaryonGamma3pt(
const PropagatorField &q_ti,
const mobj &Dq_spec1,
const mobj &Dq_spec2,
@@ -751,7 +962,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
} else if (group == 2) {
@@ -759,7 +970,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
} else if (group == 3) {
@@ -767,7 +978,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
@@ -787,7 +998,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
* Ds_ti is a quark line from t_i to t_H */
template <class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
void BaryonUtils<FImpl>::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec,
const mobj &Dd_tf,
const mobj &Ds_ti,
@@ -838,7 +1049,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
* Ds_ti is a quark line from t_i to t_H */
template <class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
void BaryonUtils<FImpl>::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf,
const mobj2 &Du_spec,
const mobj &Dd_tf,
@@ -897,7 +1108,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
* Ds_ti is a quark line from t_i to t_H */
template <class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
void BaryonUtils<FImpl>::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec,
const mobj &Dd_tf,
const mobj &Ds_ti,
@@ -948,7 +1159,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
* Ds_ti is a quark line from t_i to t_H */
template <class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
void BaryonUtils<FImpl>::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf,
const mobj2 &Du_spec,
const mobj &Dd_tf,
@@ -1002,7 +1213,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
template<class FImpl>
template <class mobj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
const mobj &Du_spec,
const PropagatorField &qd_tf,
const PropagatorField &qs_ti,
@@ -1029,9 +1240,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
auto Ds_ti = vs_ti[ss];
sobj result=Zero();
if(op == "Q1"){
Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else if(op == "Q2"){
Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else {
assert(0 && "Weak Operator not correctly specified");
}
@@ -1041,7 +1252,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
template<class FImpl>
template <class mobj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
const PropagatorField &qq_tf,
const mobj &Du_spec,
const PropagatorField &qd_tf,
@@ -1071,9 +1282,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
auto Ds_ti = vs_ti[ss];
sobj result=Zero();
if(op == "Q1"){
Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else if(op == "Q2"){
Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else {
assert(0 && "Weak Operator not correctly specified");
}

View File

@@ -449,7 +449,8 @@ public:
LatticeReal alpha(grid);
// std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
alpha = toReal(2.0 * xi);
xi = 2.0 *xi;
alpha = toReal(xi);
do {
// A. Generate two uniformly distributed pseudo-random numbers R and R',

View File

@@ -26,7 +26,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifndef __NVCC__
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
NAMESPACE_BEGIN(Grid);

View File

@@ -125,14 +125,6 @@ accelerator_inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
return SimdApply(SqrtRealFunctor<S>(), r);
}
template <class S, class V>
accelerator_inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
return SimdApply(RSqrtRealFunctor<S>(), r);
}
template <class Scalar>
accelerator_inline Scalar rsqrt(const Scalar &r) {
return (RSqrtRealFunctor<Scalar>(), r);
}
template <class S, class V>
accelerator_inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
return SimdApply(CosRealFunctor<S>(), r);
}

View File

@@ -92,17 +92,22 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
{
// need a check for the group type?
iMatrix<vtype,N> ret(arg);
vtype rnrm;
vtype nrm;
vtype inner;
for(int c1=0;c1<N;c1++){
// Normalises row c1
zeroit(inner);
for(int c2=0;c2<N;c2++)
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
nrm = rsqrt(inner);
nrm = sqrt(inner);
nrm = 1.0/nrm;
for(int c2=0;c2<N;c2++)
ret._internal[c1][c2]*= nrm;
// Remove c1 from rows c1+1...N-1
for (int b=c1+1; b<N; ++b){
decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
zeroit(pr);

View File

@@ -84,7 +84,6 @@ NAMESPACE_BEGIN(Grid);
}
UNARY(sqrt);
UNARY(rsqrt);
UNARY(sin);
UNARY(cos);
UNARY(asin);

View File

@@ -48,7 +48,7 @@ void acceleratorInit(void)
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) {
#ifndef GRID_IBM_SUMMIT
#ifndef GRID_DEFAULT_GPU
if ( i==rank ) {
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i);
@@ -73,11 +73,17 @@ void acceleratorInit(void)
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
#ifdef GRID_DEFAULT_GPU
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n");
if ( world_rank == 0 ) {
printf("AcceleratorCudaInit: using default device \n");
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n");
}
#else
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
@@ -139,11 +145,18 @@ void acceleratorInit(void)
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
#ifdef GRID_DEFAULT_GPU
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: using default device \n");
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
}
#else
if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n");
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
}
hipSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");

View File

@@ -166,15 +166,18 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline int acceleratorIsCommunicable(void *ptr)
{
int uvm;
auto
cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
assert(cuerr == cudaSuccess );
if(uvm) return 0;
else return 1;
// int uvm=0;
// auto
// cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
// assert(cuerr == cudaSuccess );
// if(uvm) return 0;
// else return 1;
return 1;
}
#endif
@@ -229,8 +232,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
@@ -328,10 +333,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ free(ptr);};
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
#endif
@@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
@@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////

View File

@@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv)
LebesgueOrder::UseLebesgueOrder=1;
}
CartesianCommunicator::nCommThreads = 1;
#ifdef GRID_COMMS_THREADS
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
assert(CartesianCommunicator::nCommThreads > 0);
}
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);

33
README
View File

@@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash
mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
```
where `--enable-precision=` set the default precision,
`--enable-simd=` set the SIMD type, `--enable-
where `--enable-simd=` set the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section, you can also use `configure
--help` to display them. Like with any other program using GNU autotool, the
@@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests.
@@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi \
--enable-mkl \
CXX=CC CC=cc
@@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi3-auto \
--enable-mkl \
CC=icpc MPICXX=mpiicpc
@@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -262,8 +257,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
@@ -280,8 +274,7 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=mpiicpc
@@ -298,8 +291,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
@@ -330,8 +322,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform.
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3 \
CXX=mpicxx
```

View File

@@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash
mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
```
where `--enable-precision=` set the default precision,
`--enable-simd=` set the SIMD type, `--enable-
where `--enable-simd=` set the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section, you can also use `configure
--help` to display them. Like with any other program using GNU autotool, the
@@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests.
@@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi \
--enable-mkl \
CXX=CC CC=cc
@@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi3-auto \
--enable-mkl \
CC=icpc MPICXX=mpiicpc
@@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -266,8 +261,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
@@ -284,8 +278,7 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=mpiicpc
@@ -302,8 +295,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
@@ -334,8 +326,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform.
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi3 \
CXX=mpicxx
```

View File

@@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64
scl enable gcc-toolset-10 bash
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.1 prebuild w/ MPI, QPACE4 interactive login
scl enable gcc-toolset-10 bash
module load mpi/openmpi-aarch64
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------
* armclang 20.2 (qp4)
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------
* gcc 10.0.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
* gcc 10.0.1 fixed-size ACLE (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI
@@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1
export MPICH_CC=gcc-10.0.1
export MPICH_CXX=g++-10.0.1
$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
--------------------------------------------------------
* armclang 20.0 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA w/MPI (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
No ARMCLANGCOMPAT -> still correct ?
@@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ?
* Fujitsu fcc
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
* Fujitsu fcc w/ MPI
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"

View File

@@ -1,8 +1,16 @@
#include "Benchmark_IO.hpp"
#ifndef BENCH_IO_LMIN
#define BENCH_IO_LMIN 8
#endif
#ifndef BENCH_IO_LMAX
#define BENCH_IO_LMAX 40
#define BENCH_IO_LMAX 32
#endif
#ifndef BENCH_IO_NPASS
#define BENCH_IO_NPASS 10
#endif
using namespace Grid;
@@ -12,62 +20,177 @@ std::string filestem(const int l)
return "iobench_l" + std::to_string(l);
}
int vol(const int i)
{
return BENCH_IO_LMIN + 2*i;
}
int volInd(const int l)
{
return (l - BENCH_IO_LMIN)/2;
}
template <typename Mat>
void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
{
auto nr = data[0].rows(), nc = data[0].cols();
Eigen::MatrixXd sqSum(nr, nc);
double n = static_cast<double>(data.size());
assert(n > 1.);
mean = Mat::Zero(nr, nc);
sqSum = Mat::Zero(nr, nc);
for (auto &d: data)
{
mean += d;
sqSum += d.cwiseProduct(d);
}
stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt();
mean /= n;
}
#define grid_printf(...) \
{\
char _buf[1024];\
sprintf(_buf, __VA_ARGS__);\
MSG << _buf;\
}
enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1;
unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1;
std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
std::vector<int> latt;
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl;
for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
{
MSG << BIGSEP << std::endl;
MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
MSG << BIGSEP << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#endif
avPerf[i].fill(0.);
for (int f = 0; f < 4; ++f)
for (int l = 24; l <= BENCH_IO_LMAX; l += 2)
{
avPerf[i](f) += perf[i](volInd(l), f);
}
avPerf[i] /= nRelVol;
}
Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
double n = BENCH_IO_NPASS;
stats(mean, stdDev, perf);
stats(avMean, avStdDev, avPerf);
rob.fill(100.);
rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs());
avRob.fill(100.);
avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
MSG << BIGSEP << std::endl;
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
"L", "std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
mean(volInd(l), gRead), stdDev(volInd(l), gRead),
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
}
MSG << std::endl;
MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n",
"L", "std read", "std write", "Grid read", "Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
MSG << std::endl;
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
"std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n",
"std read", "std write", "Grid read", "Grid write");
grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
Grid_finalize();

View File

@@ -5,6 +5,8 @@
#ifdef HAVE_LIME
#define MSG std::cout << GridLogMessage
#define SEP \
"-----------------------------------------------------------------------------"
#define BIGSEP \
"============================================================================="
namespace Grid {
@@ -37,9 +39,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
//
@@ -72,9 +77,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
@@ -100,9 +108,12 @@ void stdWrite(const std::string filestem, Field &vec)
file.flush();
ioWatch.Stop();
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
@@ -135,9 +146,12 @@ void stdRead(Field &vec, const std::string filestem)
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
@@ -200,6 +214,7 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt;
std::random_device rd;
makeGrid(gPt, gBasePt, Ls, rb);
@@ -207,6 +222,11 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
GridParallelRNG rng(g);
Field vec(g);
rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd())});
random(rng, vec);
write(filename, vec);
}

View File

@@ -1,9 +1,5 @@
#include "Benchmark_IO.hpp"
#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="
using namespace Grid;
int main (int argc, char ** argv)

View File

@@ -62,7 +62,7 @@ struct time_statistics{
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
<<"bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)"<<std::endl;
};
Gamma::Algebra Gmu [] = {
@@ -125,7 +125,7 @@ public:
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
std::cout << GridLogMessage<< latt_size <<std::endl;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
@@ -137,8 +137,8 @@ public:
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -189,11 +189,11 @@ public:
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
<<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
@@ -202,6 +202,8 @@ public:
return;
}
static void Memory(void)
{
const int Nvec=8;
@@ -222,7 +224,7 @@ public:
uint64_t lmax=32;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
@@ -247,11 +249,6 @@ public:
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
autoView( x_v , x, CpuWrite);
autoView( y_v , y, CpuWrite);
autoView( z_v , z, CpuRead);
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
@@ -266,6 +263,61 @@ public:
};
static void SU4(void)
{
const int Nc4=4;
typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NN;
uint64_t lmax=32;
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NN =Grid.NodeCount();
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF(int Ls,int L)
{
RealD mass=0.1;
@@ -282,8 +334,9 @@ public:
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
@@ -291,11 +344,11 @@ public:
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Nc : "<<Nc<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
@@ -324,7 +377,7 @@ public:
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
Gauge Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
Gauge Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Fermion src (FGrid); random(RNG5,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
@@ -369,7 +422,7 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 50;
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
@@ -387,7 +440,17 @@ public:
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
#if 0
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
#else
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
#endif
double flops=(fps*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
@@ -402,6 +465,7 @@ public:
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
@@ -438,8 +502,9 @@ public:
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
@@ -447,7 +512,6 @@ public:
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -478,7 +542,7 @@ public:
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu);
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
@@ -596,11 +660,12 @@ int main (int argc, char ** argv)
#endif
Benchmark::Decomposition();
int do_su4=1;
int do_memory=1;
int do_comms =1;
int sel=2;
std::vector<int> L_list({16,24,32});
int sel=4;
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> wilson;
@@ -624,7 +689,6 @@ int main (int argc, char ** argv)
dwf4.push_back(result);
}
/*
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -632,14 +696,13 @@ int main (int argc, char ** argv)
double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result);
}
*/
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -651,6 +714,13 @@ int main (int argc, char ** argv)
Benchmark::Memory();
}
if ( do_su4 ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::SU4();
}
if ( do_comms && (NN>1) ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
@@ -661,9 +731,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 " <<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

View File

@@ -94,8 +94,8 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);

View File

@@ -0,0 +1,260 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_comms.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes\t\t"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
int Nloop=250;
int nmu=0;
int maxlat=32;
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
}
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
int ncomm;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
double start=usecond();
for(int i=0;i<Nloop;i++){
ncomm=0;
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.Barrier();
double stop=usecond();
double mean=(stop-start)/Nloop;
double dbytes = bytes*ppn;
double xbytes = dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
<<std::right<< xbytes/mean<<" "
<< "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
}
}
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory "<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
}
int ncomm;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
double start=usecond();
for(int i=0;i<Nloop;i++){
ncomm=0;
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.Barrier();
double stop=usecond();
double mean=(stop-start)/Nloop;
double dbytes = bytes*ppn;
double xbytes = dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
<<std::right<< xbytes/mean<<" "
<< "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
}
}
for(int d=0;d<8;d++){
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
Grid_finalize();
}

View File

@@ -108,7 +108,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;

View File

@@ -0,0 +1,364 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=8;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File

@@ -63,7 +63,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
RealD mass=0.1;

View File

@@ -30,7 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Seeded"<<std::endl;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "made random gauge fields"<<std::endl;

View File

@@ -1,76 +0,0 @@
#!/usr/bin/env bash
awkscript='
BEGIN{
i = 0;
print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)"
}
/Benchmark std write/{
i = 0;
mode = "stdWrite";
}
/Benchmark std read/{
i = 0;
mode = "stdRead"
}
/Benchmark Grid C-Lime write/{
i = 0;
mode = "gridWrite";
}
/Benchmark Grid C-Lime read/{
i = 0;
mode = "gridRead";
}
/Local volume/{
match($0, "[0-9]+\\^4");
l[i] = substr($0, RSTART, RLENGTH-2);
}
/MB\/s/{
match($0, "[0-9.eE]+ MB/s");
p = substr($0, RSTART, RLENGTH-5);
if (mode == "stdWrite")
{
sw[i] = p;
}
else if (mode == "stdRead")
{
sr[i] = p;
}
else if (mode == "gridWrite")
{
gw[i] = p;
}
else if (mode == "gridRead")
{
gr[i] = p;
}
i++;
}
END{
s = 0
for (a in l)
{
s++;
}
for (j = 0; j < s; j++)
{
printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]);
}
printf("\n");
}
'
if (( $# != 1 )); then
echo "usage: `basename $0` <log file>" 1>&2
exit 1
fi
LOG=$1
awk "${awkscript}" ${LOG}

View File

@@ -123,6 +123,24 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### Nc
AC_ARG_ENABLE([Nc],
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in
2)
AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
3)
AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);;
4)
AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);;
5)
AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);;
*)
AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
esac
############### FP16 conversions
AC_ARG_ENABLE([sfw-fp16],
[AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
@@ -135,18 +153,28 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac
############### SUMMIT JSRUN
AC_ARG_ENABLE([summit],
[AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
[ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no])
case ${ac_SUMMIT} in
no);;
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([accelerator-cshift],
[AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])],
[ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
AC_ARG_ENABLE([ucx-buggy],
[AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX device buffer bugs])],
[ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
case ${ac_UCXBUGGY} in
yes)
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
*)
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
ac_ACC_CSHIFT=no;;
*);;
esac
case ${ac_ACC_CSHIFT} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
*);;
esac
############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator],
[AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
@@ -163,8 +191,9 @@ case ${ac_ACCELERATOR} in
echo HIP acceleration
AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);;
none)
echo NO acceleration
;;
echo NO acceleration ;;
no)
echo NO acceleration ;;
*)
AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
esac
@@ -459,27 +488,26 @@ esac
AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
############### Precision selection
AC_ARG_ENABLE([precision],
[AC_HELP_STRING([--enable-precision=single|double],
[Select default word size of Real])],
[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in
single)
AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
;;
double)
###### PRECISION ALWAYS DOUBLE
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
#########################################################
###################### set GPU device to rank in node ##
#########################################################
AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
[Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no])
case ${ac_SETDEVICE} in
yes);;
*)
AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] )
;;
esac
###################### Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
#########################################################
###################### Shared memory intranode #########
#########################################################
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
case ${ac_SHM} in
@@ -498,8 +526,12 @@ case ${ac_SHM} in
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmnone)
shmnone | no)
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
AC_DEFINE([GRID_SHM_DISABLE],[1],[USE MPI for intranode comms]);;
nvlink)
AC_DEFINE([GRID_MPI3_SHM_NVLINK],[1],[GRID_MPI3_SHM_NVLINK] )
;;
hugetlbfs)
@@ -518,10 +550,23 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
[ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
############### communication type selection
AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads],
[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
case ${ac_COMMS_THREADS} in
yes)
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
;;
*) ;;
esac
############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in
none)
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
@@ -656,6 +701,7 @@ os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
Nc : ${ac_Nc}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Acceleration : ${ac_ACCELERATOR}

View File

@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug
#### 2. `Release`
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`:
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release
#### 3. `MPIDebug`
Debug configuration with MPI:
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug
### 5.3 Build Grid

View File

@@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system::
Now you can execute the `configure` script to generate makefiles (here from a build directory)::
mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \
../configure --enable-simd=AVX --enable-comms=mpi-auto \
--prefix=<path>
where::
--enable-precision=single|double
sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations,
and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.::
::
--enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX
@@ -236,7 +231,7 @@ Detailed build configuration options
--enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
--enable-simd=code setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below.
--enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes.
--enable-precision=single|double set the default precision (default: `double`).
--enable-precision=single|double set the default precision (default: `double`). **Deprecated option**
--enable-comms=mpi|none use `<comm>` for message passing (default: `none`).
--enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`).
--disable-timers disable system dependent high-resolution timers.
@@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform
The following configuration is recommended for the Intel Knights Landing platform::
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi \
--enable-mkl \
CXX=CC CC=cc
@@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared
for interior communication.
We recommend four ranks per node for best performance, but optimum is local volume dependent. ::
../configure --enable-precision=double\
--enable-simd=KNL \
../configure --enable-simd=KNL \
--enable-comms=mpi-auto \
--enable-mkl \
CC=icpc MPICXX=mpiicpc
@@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform
The following configuration is recommended for the Intel Haswell platform::
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
@@ -360,8 +351,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi \
--enable-mkl \
CXX=CC CC=cc
@@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform
The following configuration is recommended for the Intel Skylake platform::
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi \
--enable-mkl \
CXX=mpiicpc
@@ -396,8 +385,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\
--enable-simd=AVX512 \
../configure --enable-simd=AVX512 \
--enable-comms=mpi \
--enable-mkl \
CXX=CC CC=cc
@@ -422,8 +410,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform::
../configure --enable-precision=double\
--enable-simd=AVX2 \
../configure --enable-simd=AVX2 \
--enable-comms=mpi \
CXX=mpicxx

View File

@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
std::vector<LatticeColourMatrix> U(4,&Fine);
SU3::HotConfiguration(pRNGa,Umu);
SU<Nc>::HotConfiguration(pRNGa,Umu);
FieldMetaData header;

View File

@@ -84,7 +84,7 @@ int main (int argc, char ** argv)
std::vector<LatticeColourMatrix> U(4,&Fine);
SU3::HotConfiguration(pRNGa,Umu);
SU<Nc>::HotConfiguration(pRNGa,Umu);
FieldMetaData header;
std::string file("./ckpoint_lat.4000");

View File

@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
RealD mass=0.1;
RealD M5 =1.8;

View File

@@ -202,7 +202,7 @@ int main (int argc, char ** argv) {
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
// FieldMetaData header;
// NerscIO::readConfiguration(Umu,header,Params.config);

View File

@@ -71,7 +71,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu);

View File

@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu);

View File

@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){

View File

@@ -131,7 +131,7 @@ int main (int argc, char ** argv)
// LatticeFermion result(FGrid); result=Zero();
// LatticeGaugeField Umu(UGrid);
// SU3::HotConfiguration(RNG4,Umu);
// SU<Nc>::HotConfiguration(RNG4,Umu);
// std::vector<LatticeColourMatrix> U(4,UGrid);
// for(int mu=0;mu<Nd;mu++){

View File

@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
RealD mass=0.1;

View File

@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
LatticeFermion ref (FGrid); ref = Zero();
LatticeFermion tmp (FGrid); tmp = Zero();
LatticeFermion err (FGrid); err = Zero();
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu);
LatticeGaugeField Umu (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y)

View File

@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid); tmp=Zero();
LatticeFermion err(FGrid); tmp=Zero();
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y)

View File

@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(&GRID);
SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
// Umu=Zero();
////////////////////////////////////////////////////
// Wilson test

View File

@@ -73,11 +73,11 @@ int main (int argc, char ** argv)
LatticeColourMatrix xform2(&GRID); // Gauge xform
LatticeColourMatrix xform3(&GRID); // Gauge xform
SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
Uorg=Umu;
Urnd=Umu;
SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge
SU<Nc>::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge
Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -121,7 +121,7 @@ int main (int argc, char ** argv)
std::cout<< "* Testing non-unit configuration *" <<std::endl;
std::cout<< "*****************************************************************" <<std::endl;
SU3::HotConfiguration(pRNG,Umu); // Unit gauge
SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge
plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -136,7 +136,7 @@ int main (int argc, char ** argv)
std::cout<< "*****************************************************************" <<std::endl;
Umu=Urnd;
SU3::HotConfiguration(pRNG,Umu); // Unit gauge
SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge
plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl;

View File

@@ -114,7 +114,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4);
GparityGaugeField Umu_2f(UGrid_2f);
SU3::HotConfiguration(RNG4_2f,Umu_2f);
SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);
StandardFermionField src (FGrid_2f);
StandardFermionField tmpsrc(FGrid_2f);

View File

@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
FermionField ref(&Grid); ref=Zero();
FermionField tmp(&Grid); tmp=Zero();
FermionField err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;

View File

@@ -66,7 +66,7 @@ int main(int argc, char** argv) {
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl;
std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
SU3::printGenerators();
@@ -114,8 +114,8 @@ int main(int argc, char** argv) {
LatticeGaugeField U(grid), V(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V);
// Adjoint representation
// Test group structure
@@ -123,8 +123,8 @@ int main(int argc, char** argv) {
LatticeGaugeField UV(grid);
UV = Zero();
for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu = peekLorentz(U,mu);
SU<Nc>::LatticeMatrix Vmu = peekLorentz(V,mu);
SU3::LatticeMatrix Umu = peekLorentz(U,mu);
SU3::LatticeMatrix Vmu = peekLorentz(V,mu);
pokeLorentz(UV,Umu*Vmu, mu);
}
@@ -151,16 +151,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations
// Create a random vector
SU<Nc>::LatticeAlgebraVector h_adj(grid);
SU3::LatticeAlgebraVector h_adj(grid);
typename AdjointRep<Nc>::LatticeMatrix Ar(grid);
random(gridRNG,h_adj);
h_adj = real(h_adj);
SU_Adjoint<Nc>::AdjointLieAlgebraMatrix(h_adj,Ar);
// Re-extract h_adj
SU<Nc>::LatticeAlgebraVector h_adj2(grid);
SU3::LatticeAlgebraVector h_adj2(grid);
SU_Adjoint<Nc>::projectOnAlgebra(h_adj2, Ar);
SU<Nc>::LatticeAlgebraVector h_diff = h_adj - h_adj2;
SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2;
std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl;
// Exponentiate
@@ -183,14 +183,14 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_adj,Af);
SU<Nc>::LatticeMatrix Ufund(grid);
SU3::LatticeMatrix Af(grid);
SU3::FundamentalLieAlgebraMatrix(h_adj,Af);
SU3::LatticeMatrix Ufund(grid);
Ufund = expMat(Af, 1.0, 16);
// Check unitarity
SU<Nc>::LatticeMatrix uno_f(grid);
SU3::LatticeMatrix uno_f(grid);
uno_f = 1.0;
SU<Nc>::LatticeMatrix UnitCheck(grid);
SU3::LatticeMatrix UnitCheck(grid);
UnitCheck = Ufund * adj(Ufund) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck)
<< std::endl;
@@ -311,14 +311,14 @@ int main(int argc, char** argv) {
// Test group structure
// (U_f * V_f)_r = U_r * V_r
LatticeGaugeField U2(grid), V2(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
LatticeGaugeField UV2(grid);
UV2 = Zero();
for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu2 = peekLorentz(U2,mu);
SU<Nc>::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu);
SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
pokeLorentz(UV2,Umu2*Vmu2, mu);
}
@@ -345,16 +345,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations
// Create a random vector
SU<Nc>::LatticeAlgebraVector h_sym(grid);
SU3::LatticeAlgebraVector h_sym(grid);
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid);
random(gridRNG,h_sym);
h_sym = real(h_sym);
SU_TwoIndex<Nc,Symmetric>::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym);
// Re-extract h_sym
SU<Nc>::LatticeAlgebraVector h_sym2(grid);
SU3::LatticeAlgebraVector h_sym2(grid);
SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym);
SU<Nc>::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl;
@@ -379,11 +379,11 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af_sym(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
SU<Nc>::LatticeMatrix Ufund2(grid);
SU3::LatticeMatrix Af_sym(grid);
SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
SU3::LatticeMatrix Ufund2(grid);
Ufund2 = expMat(Af_sym, 1.0, 16);
SU<Nc>::LatticeMatrix UnitCheck2(grid);
SU3::LatticeMatrix UnitCheck2(grid);
UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2)
<< std::endl;
@@ -421,14 +421,14 @@ int main(int argc, char** argv) {
// Test group structure
// (U_f * V_f)_r = U_r * V_r
LatticeGaugeField U2A(grid), V2A(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
LatticeGaugeField UV2A(grid);
UV2A = Zero();
for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu2A = peekLorentz(U2,mu);
SU<Nc>::LatticeMatrix Vmu2A = peekLorentz(V2,mu);
SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu);
SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu);
pokeLorentz(UV2A,Umu2A*Vmu2A, mu);
}
@@ -455,16 +455,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations
// Create a random vector
SU<Nc>::LatticeAlgebraVector h_Asym(grid);
SU3::LatticeAlgebraVector h_Asym(grid);
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid);
random(gridRNG,h_Asym);
h_Asym = real(h_Asym);
SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym);
// Re-extract h_sym
SU<Nc>::LatticeAlgebraVector h_Asym2(grid);
SU3::LatticeAlgebraVector h_Asym2(grid);
SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym);
SU<Nc>::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl;
@@ -489,11 +489,11 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af_Asym(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
SU<Nc>::LatticeMatrix Ufund2A(grid);
SU3::LatticeMatrix Af_Asym(grid);
SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
SU3::LatticeMatrix Ufund2A(grid);
Ufund2A = expMat(Af_Asym, 1.0, 16);
SU<Nc>::LatticeMatrix UnitCheck2A(grid);
SU3::LatticeMatrix UnitCheck2A(grid);
UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A)
<< std::endl;

View File

@@ -444,7 +444,7 @@ int main(int argc, char **argv) {
// Lattice 12x12 GEMM
scFooBar = scFoo * scBar;
// Benchmark some simple operations LatticeSU3 * Lattice SU3.
// Benchmark some simple operations LatticeSU<Nc> * Lattice SU<Nc>.
double t0, t1, flops;
double bytes;
int ncall = 5000;

View File

@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
LatticeFermion ref (FGrid); ref = Zero();
LatticeFermion tmp (FGrid); tmp = Zero();
LatticeFermion err (FGrid); err = Zero();
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu);
LatticeGaugeField Umu (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y)

View File

@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds);
// SU3 colour operatoions
// SU<Nc> colour operatoions
LatticeColourMatrix link(grid);
LatticeColourMatrix staple(grid);
@@ -87,10 +87,10 @@ int main (int argc, char ** argv)
link = PeekIndex<LorentzIndex>(Umu,mu);
for( int subgroup=0;subgroup<SU3::su2subgroups();subgroup++ ) {
for( int subgroup=0;subgroup<SU<Nc>::su2subgroups();subgroup++ ) {
// update Even checkerboard
SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);
SU<Nc>::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);
}

View File

@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
FermionField err(&Grid); tmp=Zero();
FermionField phi (&Grid); random(pRNG,phi);
FermionField chi (&Grid); random(pRNG,chi);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);

View File

@@ -75,7 +75,7 @@ int main (int argc, char ** argv)
FermionField phi (FGrid); random(pRNG5,phi);
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::ColdConfiguration(pRNG4,Umu);
LatticeGaugeField Umua(UGrid); Umua=Umu;
double volume=Ls;

View File

@@ -84,7 +84,7 @@ int main (int argc, char ** argv)
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu);
SU<Nc>::HotConfiguration(pRNG4,Umu);
/*
for(int mu=1;mu<4;mu++){

View File

@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu);
SU<Nc>::HotConfiguration(pRNG4,Umu);
/*
for(int mu=1;mu<4;mu++){

View File

@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
FermionField err(&Grid); tmp=Zero();
FermionField phi (&Grid); random(pRNG,phi);
FermionField chi (&Grid); random(pRNG,chi);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);

106
tests/core/Test_unary.cc Normal file
View File

@@ -0,0 +1,106 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_quenched_update.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt({8,8,8,8});
GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt,
GridDefaultSimd(Nd,vComplexD::Nsimd()),
GridDefaultMpi());
GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt,
GridDefaultSimd(Nd,vComplexF::Nsimd()),
GridDefaultMpi());
///////////////////////////////
// Configuration of known size
///////////////////////////////
LatticeColourMatrixD ident(grid);
LatticeColourMatrixD U(grid);
LatticeColourMatrixD tmp(grid);
LatticeColourMatrixD org(grid);
LatticeColourMatrixF UF(gridF);
LatticeGaugeField Umu(grid);
ident =1.0;
// RNG set up for test
std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds);
SU<Nc>::HotConfiguration(pRNG,Umu);
U = PeekIndex<LorentzIndex>(Umu,0);
org=U;
tmp= U*adj(U) - ident ;
RealD Def1 = norm2( tmp );
std::cout << " Defect1 "<<Def1<<std::endl;
tmp = U - org;
std::cout << "Diff1 "<<norm2(tmp)<<std::endl;
precisionChange(UF,U);
precisionChange(U,UF);
tmp= U*adj(U) - ident ;
RealD Def2 = norm2( tmp );
std::cout << " Defect2 "<<Def2<<std::endl;
tmp = U - org;
std::cout << "Diff2 "<<norm2(tmp)<<std::endl;
U = ProjectOnGroup(U);
tmp= U*adj(U) - ident ;
RealD Def3 = norm2( tmp);
std::cout << " Defect3 "<<Def3<<std::endl;
tmp = U - org;
std::cout << "Diff3 "<<norm2(tmp)<<std::endl;
Grid_finalize();
}

View File

@@ -74,7 +74,7 @@ int main(int argc, char **argv)
FermionField chi(&Grid);
random(pRNG, chi);
LatticeGaugeField Umu(&Grid);
SU3::HotConfiguration(pRNG, Umu);
SU<Nc>::HotConfiguration(pRNG, Umu);
std::vector<LatticeColourMatrix> U(4, &Grid);
double volume = 1;

View File

@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
LatticeFermion tmp(&Grid); tmp=Zero();
LatticeFermion err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid);
SU3::HotConfiguration(pRNG,Umu);
SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;

View File

@@ -71,7 +71,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(&Grid); ref=Zero();
LatticeFermion tmp(&Grid); tmp=Zero();
LatticeFermion err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;

View File

@@ -116,7 +116,7 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid);
LatticeGaugeFieldF UmuF(UGridF);
SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(UmuF,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);

View File

@@ -77,7 +77,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
#if 0
std::vector<LatticeColourMatrix> U(4,UGrid);

View File

@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
RealD mass=0.1;

View File

@@ -71,9 +71,9 @@ int main (int argc, char ** argv)
std::string file("./ckpoint_lat.400");
NerscIO::readConfiguration(Umu,header,file);
// SU3::ColdConfiguration(RNG4,Umu);
// SU3::TepidConfiguration(RNG4,Umu);
// SU3::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(RNG4,Umu);
// SU<Nc>::TepidConfiguration(RNG4,Umu);
// SU<Nc>::HotConfiguration(RNG4,Umu);
// Umu=Zero();
RealD mass=0.1;

View File

@@ -108,8 +108,8 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
SU3::ColdConfiguration(Umu);
// SU3::HotConfiguration(RNG4,Umu);
SU<Nc>::ColdConfiguration(Umu);
// SU<Nc>::HotConfiguration(RNG4,Umu);
RealD mass=0.3;
RealD M5 =1.0;

View File

@@ -73,7 +73,7 @@ int main(int argc, char** argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5);
DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5);

View File

@@ -77,7 +77,7 @@ int main(int argc, char** argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
// GparityDomainWallFermionR::ImplParams params;
FermionAction::ImplParams params;

View File

@@ -75,7 +75,7 @@ int main(int argc, char** argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c);
MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c);

View File

@@ -79,7 +79,7 @@ int main(int argc, char** argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
FermionAction::ImplParams params;
FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c, params);

View File

@@ -102,7 +102,7 @@ int main(int argc, char **argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
// Initialize RHMC fermion operators
DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5);

View File

@@ -104,7 +104,7 @@ int main(int argc, char **argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
// Initialize RHMC fermion operators
GparityDomainWallFermionR::ImplParams params;

View File

@@ -104,7 +104,7 @@ int main(int argc, char **argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
// Initialize RHMC fermion operators
MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c);

View File

@@ -106,7 +106,7 @@ int main(int argc, char **argv)
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
SU<Nc>::HotConfiguration(RNG4, Umu);
// Initialize RHMC fermion operators
GparityDomainWallFermionR::ImplParams params;

View File

@@ -59,7 +59,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -60,7 +60,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -94,7 +94,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -72,7 +72,7 @@ int main (int argc, char** argv)
LatticeFermion MphiPrime (FGrid);
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -105,7 +105,7 @@ int main (int argc, char** argv)
for(int mu=0; mu<Nd; mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom, mommu, mu);

View File

@@ -63,8 +63,8 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
// SU3::ColdConfiguration(pRNG,U);
SU<Nc>::HotConfiguration(RNG4,U);
// SU<Nc>::ColdConfiguration(pRNG,U);
////////////////////////////////////
// Unmodified matrix element
@@ -112,7 +112,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
Hmom -= real(sum(trace(mommu*mommu)));

View File

@@ -75,7 +75,7 @@ int main (int argc, char** argv)
FermionField MphiPrime (FGrid);
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -109,7 +109,7 @@ int main (int argc, char** argv)
for(int mu=0; mu<Nd; mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom, mommu, mu);

View File

@@ -51,7 +51,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(&Grid);
SU3::HotConfiguration(pRNG,U);
SU<Nc>::HotConfiguration(pRNG,U);
double beta = 1.0;
ConjugateWilsonGaugeActionR Action(beta);
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(&Grid);
SU3::HotConfiguration(pRNG,U);
SU<Nc>::HotConfiguration(pRNG,U);
double beta = 1.0;
double c1 = 0.331;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -63,7 +63,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -100,7 +100,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -57,7 +57,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -94,7 +94,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// Traceless antihermitian momentum; gaussian in lie alg
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu);
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu);
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -58,7 +58,7 @@ int main (int argc, char ** argv)
PokeIndex<LorentzIndex>(P, P_mu, mu);
}
SU3::HotConfiguration(pRNG,U);
SU<Nc>::HotConfiguration(pRNG,U);
ConjugateGradient<LatticeGaugeField> CG(1.0e-8, 10000);
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Update the U " << std::endl;
for(int mu=0;mu<Nd;mu++){
// Traceless antihermitian momentum; gaussian in lie algebra
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
auto Umu = PeekIndex<LorentzIndex>(U, mu);
PokeIndex<LorentzIndex>(mom,mommu,mu);
Umu = expMat(mommu, dt, 12) * Umu;

View File

@@ -60,7 +60,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -96,7 +96,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -72,7 +72,7 @@ int main (int argc, char** argv)
LatticeFermion MphiPrime (FGrid);
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -107,7 +107,7 @@ int main (int argc, char** argv)
for(int mu=0; mu<Nd; mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom, mommu, mu);

View File

@@ -76,7 +76,7 @@ int main (int argc, char** argv)
FermionField MphiPrime (FGrid);
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -112,7 +112,7 @@ int main (int argc, char** argv)
for(int mu=0; mu<Nd; mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom, mommu, mu);
autoView( U_v , U, CpuRead);

View File

@@ -62,7 +62,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -96,7 +96,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(&Grid);
SU3::HotConfiguration(pRNG,U);
SU<Nc>::HotConfiguration(pRNG,U);
double beta = 1.0;
double c1 = -0.331;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

View File

@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(&Grid);
//SU2::HotConfiguration(pRNG,U);
SU3::ColdConfiguration(pRNG,U);
SU<Nc>::ColdConfiguration(pRNG,U);
////////////////////////////////////
// Unmodified matrix element
@@ -98,7 +98,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// Traceless antihermitian momentum; gaussian in lie alg
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
Hmom -= real(sum(trace(mommu*mommu)));

View File

@@ -62,8 +62,8 @@ int main(int argc, char **argv)
LatticeGaugeField U(&Grid);
SU3::HotConfiguration(pRNG, U);
//SU3::ColdConfiguration(pRNG, U);// Clover term Zero()
SU<Nc>::HotConfiguration(pRNG, U);
//SU<Nc>::ColdConfiguration(pRNG, U);// Clover term Zero()
////////////////////////////////////
// Unmodified matrix element
@@ -101,7 +101,7 @@ int main(int argc, char **argv)
for (int mu = 0; mu < Nd; mu++)
{
// Traceless antihermitian momentum; gaussian in lie alg
SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
Hmom -= real(sum(trace(mommu * mommu)));
PokeIndex<LorentzIndex>(mom, mommu, mu);

View File

@@ -59,7 +59,7 @@ int main (int argc, char ** argv)
LatticeGaugeField U(UGrid);
SU3::HotConfiguration(RNG4,U);
SU<Nc>::HotConfiguration(RNG4,U);
////////////////////////////////////
// Unmodified matrix element
@@ -109,7 +109,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
PokeIndex<LorentzIndex>(mom,mommu,mu);

Some files were not shown because too many files have changed in this diff Show More