Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-14 01:35:36 +00:00)

Tidy up of mpi3; also some cleaning of the dslash controls.

Parent: 791cb050c8
Commit: bb94ddd0eb
@@ -48,8 +48,8 @@ int main (int argc, char ** argv)
 std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+int maxlat=32;
-for(int lat=4;lat<=32;lat+=2){
+for(int lat=4;lat<=maxlat;lat+=2){
 for(int Ls=1;Ls<=16;Ls*=2){

 std::vector<int> latt_size ({lat*mpi_layout[0],
@@ -124,7 +124,7 @@ int main (int argc, char ** argv)
 std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;


-for(int lat=4;lat<=32;lat+=2){
+for(int lat=4;lat<=maxlat;lat+=2){
 for(int Ls=1;Ls<=16;Ls*=2){

 std::vector<int> latt_size ({lat,lat,lat,lat});
@@ -199,7 +199,7 @@ int main (int argc, char ** argv)
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

-for(int lat=4;lat<=32;lat+=2){
+for(int lat=4;lat<=maxlat;lat+=2){
 for(int Ls=1;Ls<=16;Ls*=2){

 std::vector<int> latt_size ({lat*mpi_layout[0],
@@ -271,131 +271,5 @@ int main (int argc, char ** argv)
 std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 }
 }
-
-
-
-#if 0
-
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-
-
-for(int lat=4;lat<=32;lat+=2){
-for(int Ls=1;Ls<=16;Ls*=2){
-
-std::vector<int> latt_size ({lat,lat,lat,lat});
-
-GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-
-std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-
-
-int ncomm;
-int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-
-
-std::vector<CartesianCommunicator::CommsRequest_t> empty;
-std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
-std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
-
-for(int mu=0;mu<4;mu++){
-ncomm=0;
-if (mpi_layout[mu]>1 ) {
-ncomm++;
-
-int comm_proc;
-int xmit_to_rank;
-int recv_from_rank;
-
-comm_proc=1;
-Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-Grid.SendToRecvFromInit(requests_fwd[mu],
-(void *)&xbuf[mu][0],
-xmit_to_rank,
-(void *)&rbuf[mu][0],
-recv_from_rank,
-bytes);
-
-comm_proc = mpi_layout[mu]-1;
-Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-Grid.SendToRecvFromInit(requests_bwd[mu],
-(void *)&xbuf[mu+4][0],
-xmit_to_rank,
-(void *)&rbuf[mu+4][0],
-recv_from_rank,
-bytes);
-
-}
-}
-
-{
-double start=usecond();
-for(int i=0;i<Nloop;i++){
-
-for(int mu=0;mu<4;mu++){
-
-if (mpi_layout[mu]>1 ) {
-
-Grid.SendToRecvFromBegin(requests_fwd[mu]);
-Grid.SendToRecvFromComplete(requests_fwd[mu]);
-Grid.SendToRecvFromBegin(requests_bwd[mu]);
-Grid.SendToRecvFromComplete(requests_bwd[mu]);
-}
-}
-Grid.Barrier();
-}
-
-double stop=usecond();
-
-double dbytes = bytes;
-double xbytes = Nloop*dbytes*2.0*ncomm;
-double rbytes = xbytes;
-double bidibytes = xbytes+rbytes;
-
-double time = stop-start;
-
-std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-
-}
-
-
-{
-double start=usecond();
-for(int i=0;i<Nloop;i++){
-
-for(int mu=0;mu<4;mu++){
-
-if (mpi_layout[mu]>1 ) {
-
-Grid.SendToRecvFromBegin(requests_fwd[mu]);
-Grid.SendToRecvFromBegin(requests_bwd[mu]);
-Grid.SendToRecvFromComplete(requests_fwd[mu]);
-Grid.SendToRecvFromComplete(requests_bwd[mu]);
-}
-}
-Grid.Barrier();
-}
-
-double stop=usecond();
-
-double dbytes = bytes;
-double xbytes = Nloop*dbytes*2.0*ncomm;
-double rbytes = xbytes;
-double bidibytes = xbytes+rbytes;
-
-double time = stop-start;
-
-std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-
-}
-
-}
-}
-
-#endif
-
 Grid_finalize();
 }
@@ -44,7 +44,6 @@ struct scal {
 Gamma::GammaT
 };

-bool overlapComms = false;
 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
@@ -54,10 +53,6 @@ int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);

-if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-overlapComms = true;
-}
-
 int threads = GridThread::GetThreads();
 std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

@@ -126,14 +121,21 @@ int main (int argc, char ** argv)

 RealD NP = UGrid->_Nprocessors;

-for(int doasm=1;doasm<2;doasm++){
-
-QCD::WilsonKernelsStatic::AsmOpt=doasm;
-
 DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
-std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

 int ncall =100;
 if (1) {

@@ -162,6 +164,17 @@ int main (int argc, char ** argv)

 if (1)
 {

+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 LatticeFermion ssrc(sFGrid);
 LatticeFermion sref(sFGrid);
@@ -248,6 +261,16 @@ int main (int argc, char ** argv)
 sr_e = zero;
 sr_o = zero;

+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

 sDw.ZeroCounters();
 sDw.stat.init("DhopEO");
 double t0=usecond();
@@ -308,7 +331,7 @@ int main (int argc, char ** argv)
 ref = -0.5*ref;
 }
 Dw.Dhop(src,result,1);
-std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
+std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
 std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
 std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
@@ -322,13 +345,22 @@ int main (int argc, char ** argv)
 LatticeFermion r_eo (FGrid);


-std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
+std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
 pickCheckerboard(Even,src_e,src);
 pickCheckerboard(Odd,src_o,src);

 std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
 std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 {
 Dw.ZeroCounters();
 double t0=usecond();
@@ -366,8 +398,5 @@ int main (int argc, char ** argv)
 assert(norm2(src_e)<1.0e-5);
 assert(norm2(src_o)<1.0e-5);
-
-
-}

 Grid_finalize();
 }
@@ -1,153 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./benchmarks/Benchmark_dwf.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-template<class d>
-struct scal {
-d internal;
-};
-
-Gamma::GammaMatrix Gmu [] = {
-Gamma::GammaX,
-Gamma::GammaY,
-Gamma::GammaZ,
-Gamma::GammaT
-};
-
-bool overlapComms = false;
-
-
-int main (int argc, char ** argv)
-{
-Grid_init(&argc,&argv);
-
-if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-overlapComms = true;
-}
-
-int threads = GridThread::GetThreads();
-std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-std::vector<int> latt4 = GridDefaultLatt();
-const int Ls=16;
-GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-std::vector<int> seeds4({1,2,3,4});
-std::vector<int> seeds5({5,6,7,8});
-
-GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-
-LatticeFermion src (FGrid); random(RNG5,src);
-LatticeFermion result(FGrid); result=zero;
-LatticeFermion ref(FGrid); ref=zero;
-LatticeFermion tmp(FGrid);
-LatticeFermion err(FGrid);
-
-ColourMatrix cm = Complex(1.0,0.0);
-
-LatticeGaugeField Umu(UGrid);
-random(RNG4,Umu);
-
-LatticeGaugeField Umu5d(FGrid);
-
-// replicate across fifth dimension
-for(int ss=0;ss<Umu._grid->oSites();ss++){
-for(int s=0;s<Ls;s++){
-Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
-}
-}
-
-////////////////////////////////////
-// Naive wilson implementation
-////////////////////////////////////
-std::vector<LatticeColourMatrix> U(4,FGrid);
-for(int mu=0;mu<Nd;mu++){
-U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-}
-
-if (1)
-{
-ref = zero;
-for(int mu=0;mu<Nd;mu++){
-
-tmp = U[mu]*Cshift(src,mu+1,1);
-ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-
-tmp =adj(U[mu])*src;
-tmp =Cshift(tmp,mu+1,-1);
-ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-}
-ref = -0.5*ref;
-}
-
-RealD mass=0.1;
-RealD M5 =1.8;
-
-typename DomainWallFermionR::ImplParams params;
-params.overlapCommsCompute = overlapComms;
-
-RealD NP = UGrid->_Nprocessors;
-
-
-QCD::WilsonKernelsStatic::AsmOpt=1;
-
-DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-
-std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-int ncall =50;
-if (1) {
-
-double t0=usecond();
-for(int i=0;i<ncall;i++){
-Dw.Dhop(src,result,0);
-}
-double t1=usecond();
-
-double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
-
-std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
-std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
-err = ref-result;
-std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-// Dw.Report();
-}
-Grid_finalize();
-}
@@ -51,16 +51,18 @@ int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);

+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

 const int Ls=8;
 int threads = GridThread::GetThreads();
 std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-if ( getenv("ASMOPT") ) {
-QCD::WilsonKernelsStatic::AsmOpt=1;
-} else {
-QCD::WilsonKernelsStatic::AsmOpt=0;
-}
-
 std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
 std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
@@ -58,6 +58,19 @@ int main (int argc, char ** argv)
 std::vector<int> seeds({1,2,3,4});
 RealD mass = 0.1;

+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
+std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

 std::cout<<GridLogMessage << "============================================================================="<< std::endl;
 std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
 std::cout<<GridLogMessage << "============================================================================="<< std::endl;
@@ -1,175 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_zmm.cc
-
-Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-
-
-using namespace Grid;
-using namespace Grid::QCD;
-
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
-
-int main(int argc,char **argv)
-{
-Grid_init(&argc,&argv);
-std::ofstream os("zmm.dat");
-
-os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
-std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
-std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-for(int L=4;L<=32;L+=4){
-for(int m=1;m<=2;m++){
-for(int Ls=8;Ls<=16;Ls+=8){
-std::vector<int> grid({L,L,m*L,m*L});
-std::cout << GridLogMessage <<"\t";
-for(int i=0;i<4;i++) {
-std::cout << grid[i]<<"x";
-}
-std::cout << Ls<<"\t\t";
-bench(os,grid,Ls);
-}
-}
-}
-}
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
-{
-
-GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-std::vector<int> mpi_layout = GridDefaultMpi();
-int threads = GridThread::GetThreads();
-
-std::vector<int> seeds4({1,2,3,4});
-std::vector<int> seeds5({5,6,7,8});
-
-GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
-
-LatticeFermion src (FGrid);
-LatticeFermion tmp (FGrid);
-LatticeFermion srce(FrbGrid);
-
-LatticeFermion resulto(FrbGrid); resulto=zero;
-LatticeFermion resulta(FrbGrid); resulta=zero;
-LatticeFermion junk(FrbGrid); junk=zero;
-LatticeFermion diff(FrbGrid);
-LatticeGaugeField Umu(UGrid);
-
-double mfc, mfa, mfo, mfl1;
-
-GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-random(RNG5,src);
-#if 1
-random(RNG4,Umu);
-#else
-int mmu=2;
-std::vector<LatticeColourMatrix> U(4,UGrid);
-for(int mu=0;mu<Nd;mu++){
-U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-if ( mu!=mmu ) U[mu] = zero;
-if ( mu==mmu ) U[mu] = 1.0;
-PokeIndex<LorentzIndex>(Umu,U[mu],mu);
-}
-#endif
-pickCheckerboard(Even,srce,src);
-
-RealD mass=0.1;
-RealD M5 =1.8;
-DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-int ncall=50;
-double t0=usecond();
-for(int i=0;i<ncall;i++){
-Dw.DhopOE(srce,resulto,0);
-}
-double t1=usecond();
-
-double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume/2;
-
-mfc = flops*ncall/(t1-t0);
-std::cout<<mfc<<"\t\t";
-
-QCD::WilsonKernelsStatic::AsmOpt=1;
-t0=usecond();
-for(int i=0;i<ncall;i++){
-Dw.DhopOE(srce,resulta,0);
-}
-t1=usecond();
-mfa = flops*ncall/(t1-t0);
-std::cout<<mfa<<"\t\t";
-/*
-int dag=DaggerNo;
-t0=usecond();
-for(int i=0;i<1;i++){
-Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-}
-t1=usecond();
-mfo = flops*100/(t1-t0);
-std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
-
-t0=usecond();
-for(int i=0;i<1;i++){
-Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-}
-t1=usecond();
-mfl1= flops*100/(t1-t0);
-std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
-os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
-<< mfc<<" "
-<< mfa<<" "
-<< mfo<<" "
-<< mfl1<<std::endl;
-*/
-
-#if 0
-for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-Dw.DhopOE(srce,resulta,0);
-PerformanceCounter Counter(i);
-Counter.Start();
-Dw.DhopOE(srce,resulta,0);
-Counter.Stop();
-Counter.Report();
-}
-#endif
-//resulta = (-0.5) * resulta;
-
-diff = resulto-resulta;
-std::cout<<norm2(diff)<<std::endl;
-return 0;
-}
lib/Init.cc (156 changed lines)
@@ -123,6 +123,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
 return;
 }

+void GridCmdOptionInt(std::string &str,int & val)
+{
+std::stringstream ss(str);
+ss>>val;
+return;
+}
+
 void GridParseLayout(char **argv,int argc,
 std::vector<int> &latt,
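Note on the hunk above: GridCmdOptionInt is a thin stringstream wrapper for single-integer option payloads, replacing the vector-based parsing for scalar flags. A minimal standalone sketch of its behaviour, assuming only standard C++ (the test harness below is illustrative, not part of the patch):

#include <iostream>
#include <sstream>
#include <string>

// Same logic as the GridCmdOptionInt added above: parse the option
// payload string into a single integer via a stringstream.
void GridCmdOptionInt(std::string &str, int &val) {
  std::stringstream ss(str);
  ss >> val;
}

int main() {
  std::string payload = "1024";   // e.g. the payload of "--shm 1024" (value illustrative)
  int MB = 0;
  GridCmdOptionInt(payload, MB);
  std::cout << "parsed " << MB << " MB" << std::endl;  // prints: parsed 1024 MB
}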
@@ -153,14 +160,12 @@ void GridParseLayout(char **argv,int argc,
 assert(ompthreads.size()==1);
 GridThread::SetThreads(ompthreads[0]);
 }

 if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
-std::vector<int> cores(0);
+int cores;
 arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
-GridCmdOptionIntVector(arg,cores);
+GridCmdOptionInt(arg,cores);
-GridThread::SetCores(cores[0]);
+GridThread::SetCores(cores);
 }

 }

 std::string GridCmdVectorIntToString(const std::vector<int> & vec){
@@ -169,7 +174,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 return oss.str();
 }
 /////////////////////////////////////////////////////////
-//
+// Reinit guard
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;

@@ -178,27 +183,31 @@ void Grid_init(int *argc,char ***argv)
 {
 GridLogger::StopWatch.Start();

+std::string arg;
+
+////////////////////////////////////
+// Shared memory block size
+////////////////////////////////////
+if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
+int MB;
+arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
+GridCmdOptionInt(arg,MB);
+CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
+}
+
 CartesianCommunicator::Init(argc,argv);

-// Parse command line args.
+////////////////////////////////////
+// Logging
+////////////////////////////////////

-std::string arg;
 std::vector<std::string> logstreams;
 std::string defaultLog("Error,Warning,Message,Performance");

 GridCmdOptionCSL(defaultLog,logstreams);
 GridLogConfigure(logstreams);

-if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
-std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
-std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
-std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
-std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
-std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
-std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
-std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
-std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
-exit(EXIT_SUCCESS);
+if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
+Grid_quiesce_nodes();
 }

 if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
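Note on the hunk above: the new --shm flag is parsed before CartesianCommunicator::Init so the shared-memory comms heap can be sized up front; the payload is megabytes, converted once to bytes. A minimal sketch of that conversion (values illustrative):

#include <cstdint>
#include <iostream>

// Mirrors the --shm handling added above: the payload is megabytes,
// converted once into bytes before the communicator is initialised.
int main() {
  int MB = 256;  // e.g. from "--shm 256" (value illustrative)
  uint64_t MAX_MPI_SHM_BYTES = uint64_t(MB) * 1024 * 1024;
  std::cout << MAX_MPI_SHM_BYTES << " byte stencil comms buffers" << std::endl;
}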
@@ -207,38 +216,39 @@ void Grid_init(int *argc,char ***argv)
 GridLogConfigure(logstreams);
 }

-if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
-Grid_debug_handler_init();
-}
-if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
-Grid_quiesce_nodes();
-}
-if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-QCD::WilsonKernelsStatic::HandOpt=1;
-}
-if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
-LebesgueOrder::UseLebesgueOrder=1;
-}
-if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
-arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
-GridCmdOptionIntVector(arg,LebesgueOrder::Block);
-}
-if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
-GridLogTimestamp(1);
+////////////////////////////////////
+// Help message
+////////////////////////////////////
+
+if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
+std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
+std::cout<<GridLogMessage<<std::endl;
+std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
+std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
+std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
+std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
+std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
+std::cout<<GridLogMessage<<std::endl;
+std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
+std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
+std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
+std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
+std::cout<<GridLogMessage<<" --timestamp : tag with millisecond resolution stamps"<<std::endl;
+std::cout<<GridLogMessage<<std::endl;
+std::cout<<GridLogMessage<<"Performance:"<<std::endl;
+std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
+std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
+std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
+std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
+std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
+std::cout<<GridLogMessage<<std::endl;
+exit(EXIT_SUCCESS);
 }

-GridParseLayout(*argv,*argc,
-Grid_default_latt,
-Grid_default_mpi);
-if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-std::cout<<GridLogMessage<<"Grid Decomposition\n";
-std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
-}
+////////////////////////////////////
+// Banner
+////////////////////////////////////

 std::string COL_RED = GridLogColours.colour["RED"];
 std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
@@ -248,7 +258,6 @@ void Grid_init(int *argc,char ***argv)
 std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
 std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
-

 std::cout <<std::endl;
 std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
 std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
@@ -281,6 +290,53 @@ void Grid_init(int *argc,char ***argv)
 std::cout << COL_BACKGROUND <<std::endl;
 std::cout << std::endl;

+////////////////////////////////////
+// Debug and performance options
+////////////////////////////////////
+
+if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
+Grid_debug_handler_init();
+}
+if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
+QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
+}
+if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
+QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
+}
+if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
+QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
+}
+if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
+LebesgueOrder::UseLebesgueOrder=1;
+}
+
+if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
+arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
+GridCmdOptionIntVector(arg,LebesgueOrder::Block);
+}
+if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
+GridLogTimestamp(1);
+}
+
+GridParseLayout(*argv,*argc,
+Grid_default_latt,
+Grid_default_mpi);
+
+std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+
+if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
+std::cout<<GridLogMessage<<"Grid Decomposition\n";
+std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+}
+
 Grid_is_initialised = 1;
 }
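Note on the hunks above: the old AsmOpt/HandOpt booleans give way to a single three-way kernel selector chosen by mutually exclusive --dslash-* flags. A minimal sketch of the same pattern, assuming a plain argv scan (enum and flag names copied from the patch; the harness and the default are illustrative):

#include <cstring>
#include <iostream>

// Sketch of the three-way kernel selector that replaces the old
// AsmOpt/HandOpt booleans: one enum, last matching flag wins.
enum Opt { OptGeneric, OptHandUnroll, OptInlineAsm };
static Opt KernelOpt = OptGeneric;  // default value illustrative

int main(int argc, char **argv) {
  for (int i = 1; i < argc; i++) {
    if (!strcmp(argv[i], "--dslash-unroll"))  KernelOpt = OptHandUnroll;
    if (!strcmp(argv[i], "--dslash-asm"))     KernelOpt = OptInlineAsm;
    if (!strcmp(argv[i], "--dslash-generic")) KernelOpt = OptGeneric;
  }
  std::cout << "kernel option = " << KernelOpt << std::endl;
}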
@@ -32,6 +32,7 @@ namespace Grid {
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
 void * CartesianCommunicator::ShmCommBuf;
+uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -41,8 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
 void *ptr = (void *)heap_top;
 heap_top += bytes;
 heap_bytes+= bytes;
-std::cout <<"Shm alloc "<<ptr<<std::endl;
+if (heap_bytes >= MAX_MPI_SHM_BYTES) {
+std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
+std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
 assert(heap_bytes<MAX_MPI_SHM_BYTES);
+}
 return ptr;
 }
 void CartesianCommunicator::ShmBufferFreeAll(void) {
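Note on the hunk above: ShmBufferMalloc is a bump allocator over the shared window, and the patch swaps the per-allocation trace for an overflow diagnostic before the assert fires. A toy standalone model of that behaviour (region size, names, and values are illustrative):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Toy model of the bump allocator in ShmBufferMalloc: advance a top
// pointer, track total bytes, and diagnose overflow before asserting.
static uint64_t MAX_SHM_BYTES = 128 * 1024 * 1024;  // set via --shm in the real code
static char shm_region[1 << 20];                    // stand-in for the MPI shared window
static char *heap_top = shm_region;
static uint64_t heap_bytes = 0;

void *ShmBufferMalloc(size_t bytes) {
  void *ptr = (void *)heap_top;
  heap_top += bytes;
  heap_bytes += bytes;
  if (heap_bytes >= MAX_SHM_BYTES) {
    std::cout << "exceeded shared heap -- try a larger --shm <MB>" << std::endl;
    assert(heap_bytes < MAX_SHM_BYTES);
  }
  return ptr;
}

int main() {
  void *buf = ShmBufferMalloc(4096);
  std::cout << "allocated at " << buf << ", used " << heap_bytes << " bytes" << std::endl;
}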
@@ -55,7 +55,7 @@ class CartesianCommunicator {
 // Give external control (command line override?) of this

 static const int MAXLOG2RANKSPERNODE = 16;
-static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
+static uint64_t MAX_MPI_SHM_BYTES;

 // Communicator should know nothing of the physics grid, only processor grid.
 int _Nprocessors; // How many in all
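Note on the hunk above: dropping const makes the buffer size a runtime-settable static member, which then requires exactly one out-of-class definition; the Communicator_base.cc hunk earlier in this commit supplies it with the old compile-time value as the default. A minimal sketch of the C++ pattern (class name shortened for illustration):

#include <cstdint>

// Header side: a mutable static member, declared only.
class Communicator {
public:
  static uint64_t MAX_MPI_SHM_BYTES;
};

// Source side: the single out-of-class definition, defaulting to the
// old compile-time value (128 MB, as in the patch).
uint64_t Communicator::MAX_MPI_SHM_BYTES = 128 * 1024 * 1024;

int main() {
  Communicator::MAX_MPI_SHM_BYTES = 256ull * 1024 * 1024;  // e.g. from --shm 256
  return 0;
}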
@@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
+#include <semaphore.h>

 namespace Grid {

@@ -45,6 +46,7 @@ const int pool = 48;

 class SlaveState {
 public:
+sem_t sem;
 volatile int head;
 volatile int start;
 volatile int tail;
@@ -56,29 +58,32 @@ public:
 SlaveState *state;
 MPI_Comm squadron;
 uint64_t base;
+int universe_rank;
+int vertical_rank;
 ////////////////////////////////////////////////////////////
 // Descriptor circular pointers
 ////////////////////////////////////////////////////////////
 Slave() {};

-void Init(SlaveState * _state,MPI_Comm _squadron);
+void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);

 void EventLoop (void) {
-std::cerr<< " Entering even loop "<<std::endl;
+std::cout<< " Entering event loop "<<std::endl;
 while(1){
 Event();
+MPI_Barrier(squadron);
 }
 }

-void Event (void) ;
+int Event (void) ;

-uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int rank) ;
+uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;

-void WaitAll() {
+void QueueWaitAll() {
 QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
-std::cerr<< " Waiting on FIFO drain "<<std::endl;
+}
+void WaitAll() {
 while ( state->tail != state->head );
-std::cerr<< " FIFO drained "<< state->tail <<std::endl;
 }
 };

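Note on the hunk above: the old blocking WaitAll is split into QueueWaitAll, which posts a COMMAND_WAITALL descriptor into the slave's queue, and WaitAll, which spins until the FIFO tail catches the head, so the master can fan the drain command out to every slave before blocking on any of them. A toy model of the drain test (std::atomic stands in for the volatile fields so the sketch is well-defined; names and counts are illustrative):

#include <atomic>
#include <iostream>
#include <thread>

// Toy model of the split wait: the "slave" thread advances tail as it
// retires commands; WaitAll spins until tail catches up with head.
struct SlaveState {
  std::atomic<int> head{0};
  std::atomic<int> tail{0};
};

int main() {
  SlaveState s;
  s.head = 4;                        // four commands queued
  std::thread slave([&] {            // slave retires them one by one
    while (s.tail < s.head) s.tail++;
  });
  while (s.tail != s.head) {}        // WaitAll: spin on FIFO drain
  slave.join();
  std::cout << "FIFO drained at " << s.tail << std::endl;
}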
@@ -119,26 +124,31 @@ public:
 MPI_Comm &ShmComm,
 void * &ShmCommBuf);

-static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank);
+static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);

 /////////////////////////////////////////////////////////
 // routines for master proc must handle any communicator
 /////////////////////////////////////////////////////////

 static uint64_t QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-std::cerr<< " Queueing send "<< bytes<<std::endl;
+// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
 return Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
 };

 static uint64_t QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
-std::cerr<< " Queueing receive "<< bytes<<std::endl;
+// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
 return Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
 };

 static void WaitAll() {
+for(int s=1;s<VerticalSize;s++) {
+Slaves[s].QueueWaitAll();
+}
+MPI_Barrier(VerticalComm);
 for(int s=1;s<VerticalSize;s++) {
 Slaves[s].WaitAll();
 }
+MPI_Barrier(VerticalComm);
 };

 static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
@@ -163,6 +173,7 @@ public:
 GetWork(bytes,s,mywork,myoff,procs);
 QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 }
+MPI_Barrier(VerticalComm);
 };

 static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
@@ -173,6 +184,7 @@ public:
 GetWork(bytes,s,mywork,myoff,procs);
 QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 }
+MPI_Barrier(VerticalComm);
 };

 };
@@ -225,7 +237,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 // Split into groups that can share memory (Verticals)
 /////////////////////////////////////////////////////////////////////
 // MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
-MPI_Comm_split(communicator_universe,(UniverseRank&0x1),UniverseRank,&VerticalComm);
+MPI_Comm_split(communicator_universe,(UniverseRank/2),UniverseRank,&VerticalComm);
 MPI_Comm_rank(VerticalComm ,&VerticalRank);
 MPI_Comm_size(VerticalComm ,&VerticalSize);

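Note on the hunk above: MPI_Comm_split groups ranks by the colour argument, so UniverseRank&0x1 formed one vertical of all even ranks and one of all odd ranks, while UniverseRank/2 pairs consecutive ranks (0 with 1, 2 with 3, ...), keeping each master with its own slave. The difference is pure arithmetic, as this small sketch shows (no MPI needed):

#include <iostream>

int main() {
  std::cout << "rank  old colour (rank&0x1)  new colour (rank/2)" << std::endl;
  for (int rank = 0; rank < 6; rank++) {
    std::cout << rank << "        " << (rank & 0x1)
              << "                     " << (rank / 2) << std::endl;
  }
  // old: {0,2,4} and {1,3,5}  -- two large groups
  // new: {0,1} {2,3} {4,5}    -- consecutive pairs share a VerticalComm
}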
@@ -262,14 +274,14 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
 assert(ierr==0);

-std::cerr<<"SHM "<<ShmCommBuf<<std::endl;
+std::cout<<"SHM "<<ShmCommBuf<<std::endl;

 VerticalShmBufs.resize(VerticalSize);
 for(int r=0;r<VerticalSize;r++){
 MPI_Aint sz;
 int dsp_unit;
 MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
-std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
+std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 }

 //////////////////////////////////////////////////////////////////////
@ -286,14 +298,16 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
if ( VerticalRank != 0 ) {
|
if ( VerticalRank != 0 ) {
|
||||||
Slave indentured;
|
Slave indentured;
|
||||||
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm);
|
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
indentured.EventLoop();
|
indentured.EventLoop();
|
||||||
assert(0);
|
assert(0);
|
||||||
} else {
|
} else {
|
||||||
Slaves.resize(VerticalSize);
|
Slaves.resize(VerticalSize);
|
||||||
for(int i=1;i<VerticalSize;i++){
|
for(int i=1;i<VerticalSize;i++){
|
||||||
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm);
|
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
|
||||||
}
|
}
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
@ -337,8 +351,10 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
|
|||||||
|
|
||||||
if ( comm == HorizontalComm ) {
|
if ( comm == HorizontalComm ) {
|
||||||
comm_world_peer = rank;
|
comm_world_peer = rank;
|
||||||
|
// std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
} else if ( comm == communicator_cached ) {
|
} else if ( comm == communicator_cached ) {
|
||||||
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||||
|
// std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
int size;
|
int size;
|
||||||
@ -360,6 +376,7 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
|
|||||||
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
|
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
|
||||||
|
|
||||||
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||||
|
// std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
|
|
||||||
assert(comm_world_peer != MPI_UNDEFINED);
|
assert(comm_world_peer != MPI_UNDEFINED);
|
||||||
}
|
}
|
||||||
@ -370,55 +387,30 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
|
|||||||
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
|
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
|
||||||
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
|
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
|
||||||
|
|
||||||
hashtag = (comm_hash<<15) | tag;
|
// hashtag = (comm_hash<<15) | tag;
|
||||||
|
hashtag = tag;
|
||||||
|
|
||||||
};
|
};
|
||||||
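Two points here. First, the Slave::Init calls now carry the universe and vertical ranks, so each slave knows its own identity without re-deriving it. Second, the tag scheme is simplified: comm_hash XOR-folds the 64-bit communicator handle into 16 bits, but the diff stops mixing it into the tag, so hashtag = tag ships the user tag unchanged. One plausible reason, not recorded in the commit, is the tag bound: MPI only guarantees MPI_TAG_UB >= 32767, and (comm_hash<<15)|tag can far exceed that. The fold itself is the standard XOR reduction:

    #include <cstdint>

    // XOR-fold a 64-bit value (e.g. a communicator handle reinterpreted as
    // an integer) down to 16 bits, as comm_hash does in MapCommRankToWorldRank.
    static inline int comm_hash16(uint64_t icomm) {
      return (int)( ((icomm >>  0) & 0xFFFF) ^ ((icomm >> 16) & 0xFFFF)
                  ^ ((icomm >> 32) & 0xFFFF) ^ ((icomm >> 48) & 0xFFFF) );
    }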

-void Slave::Init(SlaveState * _state,MPI_Comm _squadron)
+void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
{
  squadron=_squadron;
+ universe_rank=_universe_rank;
+ vertical_rank=_vertical_rank;
  state =_state;
+ std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
  state->head = state->tail = state->start = 0;
- MPI_Barrier(squadron);
  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
  int rank; MPI_Comm_rank(_squadron,&rank);
}
#define PERI_PLUS(A) ( (A+1)%pool )
-void Slave::Event (void) {
+int Slave::Event (void) {

  static int tail_last;
  static int head_last;
  static int start_last;
  int ierr;

- if ( (state->tail != tail_last)
-   ||(state->head != head_last)
-   ||(state->start != start_last)
- ) {
-   std::cerr<< " Event loop "<< state->tail << " "<< state->start<< " "<< state->head <<std::endl;
- }

- ////////////////////////////////////////////////////
- // Try to advance the tail pointers
- ////////////////////////////////////////////////////
- /*
- int t=state->tail;
- if ( t != state->start ) {
-   int flag=0;
-
-   std::cerr<< " Testing tail "<< t<<" "<< (void *)&state->Descrs[t].request
-            << " "<<state->Descrs[t].request<<std::endl;
-   // ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
-   // ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
-   assert(ierr==0);
-   if ( flag ) {
-     state->tail = PERI_PLUS(t);
-     std::cerr<< " Tail advanced from "<< t<<std::endl;
-     return;
-   }
- }
- */

  ////////////////////////////////////////////////////
  // Try to advance the start pointers
  ////////////////////////////////////////////////////
@ -426,11 +418,11 @@ void Slave::Event (void) {
  if ( s != state->head ) {
    switch ( state->Descrs[s].command ) {
    case COMMAND_ISEND:
-     std::cerr<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
+     /*
+     std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
         << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
-        << " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
+        << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
+     */
-     std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
                       state->Descrs[s].bytes,
                       MPI_CHAR,
@ -438,18 +430,17 @@ void Slave::Event (void) {
                       state->Descrs[s].tag,
                       MPIoffloadEngine::communicator_universe,
                       (MPI_Request *)&state->Descrs[s].request);
-     std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
-     std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
      assert(ierr==0);
      state->start = PERI_PLUS(s);
+     return 1;
      break;

    case COMMAND_IRECV:
-     std::cerr<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
-        << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
-        << " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
-     std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
+     /*
+     std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
+        << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
+        << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
+     */
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
                     state->Descrs[s].bytes,
                     MPI_CHAR,
@ -457,30 +448,32 @@ void Slave::Event (void) {
                     state->Descrs[s].tag,
                     MPIoffloadEngine::communicator_universe,
                     (MPI_Request *)&state->Descrs[s].request);
-     std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
-     std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
+     // std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
+     // std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
      assert(ierr==0);
      state->start = PERI_PLUS(s);
+     return 1;
      break;

    case COMMAND_WAITALL:
-     std::cerr<< " Wait all "<<std::endl;
      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
-       std::cerr<< " Wait ["<<t<<"] "<<state->Descrs[t].request <<std::endl;
-       std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
        MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
      };
      s=PERI_PLUS(s);
      state->start = s;
      state->tail = s;
+     MPI_Barrier(squadron);
+     return 1;
      break;

    default:
      assert(0);
      break;
    }
-   return;
  }
+ return 0;
}
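Event's signature change from void to int gives the caller a progress indicator: 1 when a descriptor was issued or retired, 0 when the FIFO was idle, with the new return 1 statements short-circuiting the old fall-through breaks. The EventLoop body is not part of this diff; a plausible polling driver shaped around the new contract might look like:

    // Hypothetical polling driver over Slave::Event; the real EventLoop body
    // lies outside this diff. One successful Event() call processes one
    // queued descriptor (an Isend, an Irecv, or a WaitAll drain).
    void Slave::EventLoop(void) {
      unsigned long spins = 0;
      while (1) {
        if ( Event() ) spins = 0;  // progressed: reset the idle counter
        else           spins++;    // idle poll of the shared-memory FIFO
        // a real loop might back off (pause/yield) once spins grows large
      }
    }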
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
@ -500,17 +493,29 @@ uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm
  MPI_Comm communicator;
  MPI_Request request;

- MPIoffloadEngine::MapCommRankToWorldRank(hashtag,commrank,tag,comm,worldrank);
+ MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);

- int VerticalRank = MPIoffloadEngine::VerticalRank;
  uint64_t relative= (uint64_t)buf - base;
  state->Descrs[head].buf = relative;
  state->Descrs[head].bytes = bytes;
- state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][VerticalRank];
+ state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
  state->Descrs[head].tag = hashtag;
  state->Descrs[head].command= command;
- std::cerr<< " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;

+ /*
+ if ( command == COMMAND_ISEND ) {
+   std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
+             << " to worldrank " << worldrank <<std::endl;
+   std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
+   std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
+ }
+ if ( command == COMMAND_IRECV ) {
+   std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
+             << " from worldrank " << worldrank <<std::endl;
+   std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
+   std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
+ }
+ */
  // Block until FIFO has space
  while( state->tail==next );

@ -671,6 +676,8 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
  // assert xmit and recv lie in shared memory region
  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
+ assert(from!=_processor);
+ assert(dest!=_processor);
  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
@ -32,8 +32,7 @@ directory
namespace Grid {
namespace QCD {

-int WilsonKernelsStatic::HandOpt;
-int WilsonKernelsStatic::AsmOpt;
+int WilsonKernelsStatic::Opt;

template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
@ -40,9 +40,9 @@ namespace QCD {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
+ enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  // S-direction is INNERMOST and takes no part in the parity.
- static int AsmOpt; // these are a temporary hack
+ static int Opt; // these are a temporary hack
- static int HandOpt; // these are a temporary hack

};

template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
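This is the dslash-control cleanup the commit message advertises: the two independent booleans AsmOpt and HandOpt, which allowed the contradictory state of both being set, collapse into one three-valued selector. A call site (a benchmark's flag parsing, say; the flag names here are illustrative) would now choose exactly one kernel:

    // Hypothetical option hookup; the real flag parsing lives elsewhere.
    // Exactly one kernel variant is selected, with OptGeneric the fallback.
    if      (use_asm)  WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptInlineAsm;
    else if (use_hand) WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptHandUnroll;
    else               WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric;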
@ -56,24 +56,40 @@ public:
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out)
-#ifdef AVX512
-   if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-   } else {
-#else
  {
-#endif
+   switch(Opt) {
+#ifdef AVX512
+   case OptInlineAsm:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+#endif
+   case OptHandUnroll:
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       if (HandOpt)
          WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
-       else
+       sF++;
+     }
+     sU++;
+   }
+   break;
+   case OptGeneric:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
          WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
        sF++;
      }
      sU++;
    }
+   break;
+   default:
+     assert(0);
    }
  }
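The nested #ifdef/if/else ladder becomes a single switch over Opt, with the OptInlineAsm case only compiled when AVX512 is defined; in a non-AVX512 build that case simply does not exist, and selecting it lands in default: assert(0). The pattern in miniature:

    #include <cassert>

    // Runtime switch over a static selector, with the ISA-specific case
    // compiled in only under the platform macro (mirrors the Dhop dispatch).
    enum { OptGeneric, OptHandUnroll, OptInlineAsm };
    static int Opt = OptGeneric;

    void dispatch() {
      switch (Opt) {
    #ifdef AVX512
      case OptInlineAsm:  /* asm kernel */      break;
    #endif
      case OptHandUnroll: /* unrolled kernel */ break;
      case OptGeneric:    /* generic kernel */  break;
      default: assert(0); // e.g. OptInlineAsm requested without AVX512
      }
    }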

@ -81,7 +97,7 @@ public:
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+   // no kernel choice
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
@ -95,23 +111,39 @@ public:
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {

+   switch(Opt) {
#ifdef AVX512
-   if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-   } else {
-#else
-   {
-#endif
+   case OptInlineAsm:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+#endif
+   case OptHandUnroll:
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       if (HandOpt)
          WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-       else
+       sF++;
+     }
+     sU++;
+   }
+   break;
+   case OptGeneric:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
          WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
        sF++;
      }
      sU++;
    }
+   break;
+   default:
+     assert(0);
    }
  }

@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {

int LebesgueOrder::UseLebesgueOrder;
-std::vector<int> LebesgueOrder::Block({2,2,2,2});
+std::vector<int> LebesgueOrder::Block({8,2,2,2});

LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
  n--; // 1000 0011 --> 1000 0010
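The default cache-blocking for the Lebesgue (Z-order) traversal widens from {2,2,2,2} to {8,2,2,2}, i.e. longer runs along the innermost x direction per block. The alignup helper whose first line closes this hunk is the standard round-up-to-power-of-two bit smear; the rest of its body is truncated here, but continued in the obvious way it reads as follows (a reconstruction, assuming a 32-bit IndexInteger):

    #include <cstdint>

    // Round n up to the next power of two by smearing the top set bit down,
    // then incrementing (reconstruction of the truncated alignup body).
    uint32_t alignup(uint32_t n) {
      n--;            // 1000 0011 --> 1000 0010
      n |= n >> 1;    // 1000 0010 --> 1100 0011
      n |= n >> 2;    // 1100 0011 --> 1111 0011
      n |= n >> 4;    // 1111 0011 --> 1111 1111
      n |= n >> 8;    // smear through the remaining
      n |= n >> 16;   //   bits of a 32-bit word
      n++;            // 1111 1111 --> 1 0000 0000
      return n;
    }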