mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Merge branch 'develop' into release/v0.6.0
This commit is contained in:
commit
c067051d5f
@ -42,15 +42,14 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int Nloop=10;
|
int Nloop=10;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
|
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
||||||
|
int maxlat=16;
|
||||||
|
for(int lat=4;lat<=maxlat;lat+=2){
|
||||||
|
|
||||||
for(int lat=4;lat<=32;lat+=2){
|
|
||||||
for(int Ls=1;Ls<=16;Ls*=2){
|
for(int Ls=1;Ls<=16;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
@ -125,7 +124,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
for(int lat=4;lat<=32;lat+=2){
|
for(int lat=4;lat<=maxlat;lat+=2){
|
||||||
for(int Ls=1;Ls<=16;Ls*=2){
|
for(int Ls=1;Ls<=16;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat,lat,lat,lat});
|
std::vector<int> latt_size ({lat,lat,lat,lat});
|
||||||
@ -194,128 +193,83 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
Nloop=100;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
|
||||||
|
|
||||||
|
for(int lat=4;lat<=maxlat;lat+=2){
|
||||||
for(int lat=4;lat<=32;lat+=2){
|
|
||||||
for(int Ls=1;Ls<=16;Ls*=2){
|
for(int Ls=1;Ls<=16;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat,lat,lat,lat});
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
|
lat*mpi_layout[1],
|
||||||
|
lat*mpi_layout[2],
|
||||||
|
lat*mpi_layout[3]});
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||||
|
Grid.ShmBufferFreeAll();
|
||||||
|
for(int d=0;d<8;d++){
|
||||||
|
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
}
|
||||||
|
|
||||||
int ncomm;
|
int ncomm;
|
||||||
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
|
|
||||||
|
double start=usecond();
|
||||||
|
for(int i=0;i<Nloop;i++){
|
||||||
|
|
||||||
std::vector<CartesianCommunicator::CommsRequest_t> empty;
|
std::vector<CartesianCommunicator::CommsRequest_t> requests;
|
||||||
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
|
|
||||||
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
|
|
||||||
|
|
||||||
for(int mu=0;mu<4;mu++){
|
|
||||||
ncomm=0;
|
ncomm=0;
|
||||||
if (mpi_layout[mu]>1 ) {
|
for(int mu=0;mu<4;mu++){
|
||||||
ncomm++;
|
|
||||||
|
if (mpi_layout[mu]>1 ) {
|
||||||
int comm_proc;
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
|
|
||||||
comm_proc=1;
|
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
Grid.SendToRecvFromInit(requests_fwd[mu],
|
|
||||||
(void *)&xbuf[mu][0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu][0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu]-1;
|
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
Grid.SendToRecvFromInit(requests_bwd[mu],
|
|
||||||
(void *)&xbuf[mu+4][0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&rbuf[mu+4][0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
double start=usecond();
|
|
||||||
for(int i=0;i<Nloop;i++){
|
|
||||||
|
|
||||||
for(int mu=0;mu<4;mu++){
|
ncomm++;
|
||||||
|
int comm_proc=1;
|
||||||
|
int xmit_to_rank;
|
||||||
|
int recv_from_rank;
|
||||||
|
|
||||||
if (mpi_layout[mu]>1 ) {
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
Grid.StencilSendToRecvFromBegin(requests,
|
||||||
Grid.SendToRecvFromBegin(requests_fwd[mu]);
|
(void *)&xbuf[mu][0],
|
||||||
Grid.SendToRecvFromComplete(requests_fwd[mu]);
|
xmit_to_rank,
|
||||||
Grid.SendToRecvFromBegin(requests_bwd[mu]);
|
(void *)&rbuf[mu][0],
|
||||||
Grid.SendToRecvFromComplete(requests_bwd[mu]);
|
recv_from_rank,
|
||||||
}
|
bytes);
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
}
|
|
||||||
|
|
||||||
double stop=usecond();
|
comm_proc = mpi_layout[mu]-1;
|
||||||
|
|
||||||
double dbytes = bytes;
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
double xbytes = Nloop*dbytes*2.0*ncomm;
|
Grid.StencilSendToRecvFromBegin(requests,
|
||||||
double rbytes = xbytes;
|
(void *)&xbuf[mu+4][0],
|
||||||
double bidibytes = xbytes+rbytes;
|
xmit_to_rank,
|
||||||
|
(void *)&rbuf[mu+4][0],
|
||||||
double time = stop-start;
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
{
|
|
||||||
double start=usecond();
|
|
||||||
for(int i=0;i<Nloop;i++){
|
|
||||||
|
|
||||||
for(int mu=0;mu<4;mu++){
|
|
||||||
|
|
||||||
if (mpi_layout[mu]>1 ) {
|
|
||||||
|
|
||||||
Grid.SendToRecvFromBegin(requests_fwd[mu]);
|
|
||||||
Grid.SendToRecvFromBegin(requests_bwd[mu]);
|
|
||||||
Grid.SendToRecvFromComplete(requests_fwd[mu]);
|
|
||||||
Grid.SendToRecvFromComplete(requests_bwd[mu]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Grid.Barrier();
|
|
||||||
}
|
}
|
||||||
|
Grid.StencilSendToRecvFromComplete(requests);
|
||||||
double stop=usecond();
|
Grid.Barrier();
|
||||||
|
|
||||||
double dbytes = bytes;
|
|
||||||
double xbytes = Nloop*dbytes*2.0*ncomm;
|
|
||||||
double rbytes = xbytes;
|
|
||||||
double bidibytes = xbytes+rbytes;
|
|
||||||
|
|
||||||
double time = stop-start;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
double stop=usecond();
|
||||||
|
|
||||||
|
double dbytes = bytes;
|
||||||
|
double xbytes = Nloop*dbytes*2.0*ncomm;
|
||||||
|
double rbytes = xbytes;
|
||||||
|
double bidibytes = xbytes+rbytes;
|
||||||
|
|
||||||
|
double time = stop-start; // microseconds
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
@ -44,7 +44,6 @@ struct scal {
|
|||||||
Gamma::GammaT
|
Gamma::GammaT
|
||||||
};
|
};
|
||||||
|
|
||||||
bool overlapComms = false;
|
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
||||||
@ -54,10 +53,6 @@ int main (int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
|
|
||||||
overlapComms = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
@ -126,14 +121,21 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
|
||||||
for(int doasm=1;doasm<2;doasm++){
|
|
||||||
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=doasm;
|
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
int ncall =100;
|
int ncall =100;
|
||||||
if (1) {
|
if (1) {
|
||||||
|
|
||||||
@ -162,6 +164,17 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
||||||
LatticeFermion ssrc(sFGrid);
|
LatticeFermion ssrc(sFGrid);
|
||||||
LatticeFermion sref(sFGrid);
|
LatticeFermion sref(sFGrid);
|
||||||
@ -248,6 +261,16 @@ int main (int argc, char ** argv)
|
|||||||
sr_e = zero;
|
sr_e = zero;
|
||||||
sr_o = zero;
|
sr_o = zero;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
|
||||||
sDw.ZeroCounters();
|
sDw.ZeroCounters();
|
||||||
sDw.stat.init("DhopEO");
|
sDw.stat.init("DhopEO");
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
@ -308,7 +331,7 @@ int main (int argc, char ** argv)
|
|||||||
ref = -0.5*ref;
|
ref = -0.5*ref;
|
||||||
}
|
}
|
||||||
Dw.Dhop(src,result,1);
|
Dw.Dhop(src,result,1);
|
||||||
std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
|
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
@ -322,13 +345,22 @@ int main (int argc, char ** argv)
|
|||||||
LatticeFermion r_eo (FGrid);
|
LatticeFermion r_eo (FGrid);
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
|
||||||
pickCheckerboard(Even,src_e,src);
|
pickCheckerboard(Even,src_e,src);
|
||||||
pickCheckerboard(Odd,src_o,src);
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
|
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
|
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
{
|
{
|
||||||
Dw.ZeroCounters();
|
Dw.ZeroCounters();
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
@ -366,8 +398,5 @@ int main (int argc, char ** argv)
|
|||||||
assert(norm2(src_e)<1.0e-5);
|
assert(norm2(src_e)<1.0e-5);
|
||||||
assert(norm2(src_o)<1.0e-5);
|
assert(norm2(src_o)<1.0e-5);
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
@ -1,153 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
using namespace Grid::QCD;
|
|
||||||
|
|
||||||
template<class d>
|
|
||||||
struct scal {
|
|
||||||
d internal;
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma::GammaMatrix Gmu [] = {
|
|
||||||
Gamma::GammaX,
|
|
||||||
Gamma::GammaY,
|
|
||||||
Gamma::GammaZ,
|
|
||||||
Gamma::GammaT
|
|
||||||
};
|
|
||||||
|
|
||||||
bool overlapComms = false;
|
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
|
|
||||||
overlapComms = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
|
||||||
|
|
||||||
std::vector<int> latt4 = GridDefaultLatt();
|
|
||||||
const int Ls=16;
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
|
|
||||||
LatticeFermion src (FGrid); random(RNG5,src);
|
|
||||||
LatticeFermion result(FGrid); result=zero;
|
|
||||||
LatticeFermion ref(FGrid); ref=zero;
|
|
||||||
LatticeFermion tmp(FGrid);
|
|
||||||
LatticeFermion err(FGrid);
|
|
||||||
|
|
||||||
ColourMatrix cm = Complex(1.0,0.0);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
random(RNG4,Umu);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
|
||||||
|
|
||||||
// replicate across fifth dimension
|
|
||||||
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Naive wilson implementation
|
|
||||||
////////////////////////////////////
|
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
ref = zero;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
|
|
||||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
|
||||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
|
||||||
|
|
||||||
tmp =adj(U[mu])*src;
|
|
||||||
tmp =Cshift(tmp,mu+1,-1);
|
|
||||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
|
||||||
}
|
|
||||||
ref = -0.5*ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
|
|
||||||
typename DomainWallFermionR::ImplParams params;
|
|
||||||
params.overlapCommsCompute = overlapComms;
|
|
||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
|
||||||
|
|
||||||
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=1;
|
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
|
||||||
int ncall =50;
|
|
||||||
if (1) {
|
|
||||||
|
|
||||||
double t0=usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
Dw.Dhop(src,result,0);
|
|
||||||
}
|
|
||||||
double t1=usecond();
|
|
||||||
|
|
||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
|
||||||
double flops=1344*volume*ncall;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
|
|
||||||
err = ref-result;
|
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
|
||||||
// Dw.Report();
|
|
||||||
}
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -51,16 +51,18 @@ int main (int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
const int Ls=8;
|
const int Ls=8;
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
if ( getenv("ASMOPT") ) {
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=1;
|
|
||||||
} else {
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=0;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
@ -58,6 +58,19 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<int> seeds({1,2,3,4});
|
std::vector<int> seeds({1,2,3,4});
|
||||||
RealD mass = 0.1;
|
RealD mass = 0.1;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
|
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
|
||||||
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
|
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
|
||||||
|
@ -1,175 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_zmm.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
|
|
||||||
using namespace Grid;
|
|
||||||
using namespace Grid::QCD;
|
|
||||||
|
|
||||||
|
|
||||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
|
|
||||||
|
|
||||||
int main(int argc,char **argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
std::ofstream os("zmm.dat");
|
|
||||||
|
|
||||||
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
|
|
||||||
for(int L=4;L<=32;L+=4){
|
|
||||||
for(int m=1;m<=2;m++){
|
|
||||||
for(int Ls=8;Ls<=16;Ls+=8){
|
|
||||||
std::vector<int> grid({L,L,m*L,m*L});
|
|
||||||
std::cout << GridLogMessage <<"\t";
|
|
||||||
for(int i=0;i<4;i++) {
|
|
||||||
std::cout << grid[i]<<"x";
|
|
||||||
}
|
|
||||||
std::cout << Ls<<"\t\t";
|
|
||||||
bench(os,grid,Ls);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
|
|
||||||
{
|
|
||||||
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
|
|
||||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
|
|
||||||
|
|
||||||
LatticeFermion src (FGrid);
|
|
||||||
LatticeFermion tmp (FGrid);
|
|
||||||
LatticeFermion srce(FrbGrid);
|
|
||||||
|
|
||||||
LatticeFermion resulto(FrbGrid); resulto=zero;
|
|
||||||
LatticeFermion resulta(FrbGrid); resulta=zero;
|
|
||||||
LatticeFermion junk(FrbGrid); junk=zero;
|
|
||||||
LatticeFermion diff(FrbGrid);
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
|
|
||||||
double mfc, mfa, mfo, mfl1;
|
|
||||||
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
random(RNG5,src);
|
|
||||||
#if 1
|
|
||||||
random(RNG4,Umu);
|
|
||||||
#else
|
|
||||||
int mmu=2;
|
|
||||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
if ( mu!=mmu ) U[mu] = zero;
|
|
||||||
if ( mu==mmu ) U[mu] = 1.0;
|
|
||||||
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
pickCheckerboard(Even,srce,src);
|
|
||||||
|
|
||||||
RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
|
||||||
|
|
||||||
int ncall=50;
|
|
||||||
double t0=usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
Dw.DhopOE(srce,resulto,0);
|
|
||||||
}
|
|
||||||
double t1=usecond();
|
|
||||||
|
|
||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
|
||||||
double flops=1344*volume/2;
|
|
||||||
|
|
||||||
mfc = flops*ncall/(t1-t0);
|
|
||||||
std::cout<<mfc<<"\t\t";
|
|
||||||
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=1;
|
|
||||||
t0=usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
Dw.DhopOE(srce,resulta,0);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
mfa = flops*ncall/(t1-t0);
|
|
||||||
std::cout<<mfa<<"\t\t";
|
|
||||||
/*
|
|
||||||
int dag=DaggerNo;
|
|
||||||
t0=usecond();
|
|
||||||
for(int i=0;i<1;i++){
|
|
||||||
Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
mfo = flops*100/(t1-t0);
|
|
||||||
std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
|
|
||||||
|
|
||||||
t0=usecond();
|
|
||||||
for(int i=0;i<1;i++){
|
|
||||||
Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
mfl1= flops*100/(t1-t0);
|
|
||||||
std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
|
|
||||||
os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
|
|
||||||
<< mfc<<" "
|
|
||||||
<< mfa<<" "
|
|
||||||
<< mfo<<" "
|
|
||||||
<< mfl1<<std::endl;
|
|
||||||
*/
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
|
||||||
Dw.DhopOE(srce,resulta,0);
|
|
||||||
PerformanceCounter Counter(i);
|
|
||||||
Counter.Start();
|
|
||||||
Dw.DhopOE(srce,resulta,0);
|
|
||||||
Counter.Stop();
|
|
||||||
Counter.Report();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
//resulta = (-0.5) * resulta;
|
|
||||||
|
|
||||||
diff = resulto-resulta;
|
|
||||||
std::cout<<norm2(diff)<<std::endl;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
@ -1,18 +1,12 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
|
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
|
||||||
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
|
|
||||||
|
|
||||||
echo "-- deploying Eigen source..."
|
echo "-- deploying Eigen source..."
|
||||||
wget ${EIGEN_URL} --no-check-certificate
|
wget ${EIGEN_URL} --no-check-certificate
|
||||||
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
|
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
|
||||||
rm `basename ${EIGEN_URL}`
|
rm `basename ${EIGEN_URL}`
|
||||||
|
|
||||||
echo "-- copying fftw prototypes..."
|
|
||||||
wget ${FFTW_URL}
|
|
||||||
./scripts/update_fftw.sh `basename ${FFTW_URL}`
|
|
||||||
rm `basename ${FFTW_URL}`
|
|
||||||
|
|
||||||
echo '-- generating Make.inc files...'
|
echo '-- generating Make.inc files...'
|
||||||
./scripts/filelist
|
./scripts/filelist
|
||||||
echo '-- generating configure script...'
|
echo '-- generating configure script...'
|
||||||
|
10
configure.ac
10
configure.ac
@ -260,6 +260,9 @@ case ${ac_COMMS} in
|
|||||||
mpi3|mpi3-auto)
|
mpi3|mpi3-auto)
|
||||||
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
||||||
;;
|
;;
|
||||||
|
mpi3l)
|
||||||
|
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
|
||||||
|
;;
|
||||||
shmem)
|
shmem)
|
||||||
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
|
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
|
||||||
;;
|
;;
|
||||||
@ -280,10 +283,9 @@ case ${ac_COMMS} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
|
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI,
|
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
|
||||||
[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
|
AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI3,
|
AM_CONDITIONAL(BUILD_COMMS_MPI3L,[ test "X${ac_COMMS}X" == "Xmpi3lX"] )
|
||||||
[ test "X${ac_COMMS}X" == "Xmpi3X" || test "X${ac_COMMS}X" == "Xmpi3-autoX" ])
|
|
||||||
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
|
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
|
||||||
|
|
||||||
############### RNG selection
|
############### RNG selection
|
||||||
|
@ -42,6 +42,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/cshift/Cshift_mpi.h>
|
#include <Grid/cshift/Cshift_mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_COMMS_MPI3L
|
||||||
|
#include <Grid/cshift/Cshift_mpi.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
|
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
|
||||||
#endif
|
#endif
|
||||||
|
156
lib/Init.cc
156
lib/Init.cc
@ -123,6 +123,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GridCmdOptionInt(std::string &str,int & val)
|
||||||
|
{
|
||||||
|
std::stringstream ss(str);
|
||||||
|
ss>>val;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void GridParseLayout(char **argv,int argc,
|
void GridParseLayout(char **argv,int argc,
|
||||||
std::vector<int> &latt,
|
std::vector<int> &latt,
|
||||||
@ -153,14 +160,12 @@ void GridParseLayout(char **argv,int argc,
|
|||||||
assert(ompthreads.size()==1);
|
assert(ompthreads.size()==1);
|
||||||
GridThread::SetThreads(ompthreads[0]);
|
GridThread::SetThreads(ompthreads[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
|
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
|
||||||
std::vector<int> cores(0);
|
int cores;
|
||||||
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
|
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
|
||||||
GridCmdOptionIntVector(arg,cores);
|
GridCmdOptionInt(arg,cores);
|
||||||
GridThread::SetCores(cores[0]);
|
GridThread::SetCores(cores);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
||||||
@ -169,7 +174,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
|||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
//
|
// Reinit guard
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
static int Grid_is_initialised = 0;
|
static int Grid_is_initialised = 0;
|
||||||
|
|
||||||
@ -178,27 +183,31 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
{
|
{
|
||||||
GridLogger::StopWatch.Start();
|
GridLogger::StopWatch.Start();
|
||||||
|
|
||||||
|
std::string arg;
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Shared memory block size
|
||||||
|
////////////////////////////////////
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
|
||||||
|
int MB;
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
||||||
|
GridCmdOptionInt(arg,MB);
|
||||||
|
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
|
||||||
|
}
|
||||||
|
|
||||||
CartesianCommunicator::Init(argc,argv);
|
CartesianCommunicator::Init(argc,argv);
|
||||||
|
|
||||||
// Parse command line args.
|
////////////////////////////////////
|
||||||
|
// Logging
|
||||||
|
////////////////////////////////////
|
||||||
|
|
||||||
std::string arg;
|
|
||||||
std::vector<std::string> logstreams;
|
std::vector<std::string> logstreams;
|
||||||
std::string defaultLog("Error,Warning,Message,Performance");
|
std::string defaultLog("Error,Warning,Message,Performance");
|
||||||
|
|
||||||
GridCmdOptionCSL(defaultLog,logstreams);
|
GridCmdOptionCSL(defaultLog,logstreams);
|
||||||
GridLogConfigure(logstreams);
|
GridLogConfigure(logstreams);
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
|
if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
|
||||||
std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
|
Grid_quiesce_nodes();
|
||||||
std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
|
|
||||||
exit(EXIT_SUCCESS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
|
||||||
@ -207,38 +216,39 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
GridLogConfigure(logstreams);
|
GridLogConfigure(logstreams);
|
||||||
}
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
////////////////////////////////////
|
||||||
Grid_debug_handler_init();
|
// Help message
|
||||||
}
|
////////////////////////////////////
|
||||||
if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
|
|
||||||
Grid_quiesce_nodes();
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
|
||||||
}
|
std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
QCD::WilsonKernelsStatic::HandOpt=1;
|
std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
|
||||||
}
|
std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
||||||
LebesgueOrder::UseLebesgueOrder=1;
|
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
||||||
}
|
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||||
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
|
||||||
}
|
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
|
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
||||||
GridLogTimestamp(1);
|
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --timestamp : tag with millisecond resolution stamps"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
}
|
}
|
||||||
|
|
||||||
GridParseLayout(*argv,*argc,
|
////////////////////////////////////
|
||||||
Grid_default_latt,
|
// Banner
|
||||||
Grid_default_mpi);
|
////////////////////////////////////
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
|
||||||
std::cout<<GridLogMessage<<"Grid Decomposition\n";
|
|
||||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string COL_RED = GridLogColours.colour["RED"];
|
std::string COL_RED = GridLogColours.colour["RED"];
|
||||||
std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
|
std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
|
||||||
@ -247,7 +257,6 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::string COL_BLUE = GridLogColours.colour["BLUE"];
|
std::string COL_BLUE = GridLogColours.colour["BLUE"];
|
||||||
std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
|
std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
|
||||||
std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
|
std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
|
||||||
|
|
||||||
|
|
||||||
std::cout <<std::endl;
|
std::cout <<std::endl;
|
||||||
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
|
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
|
||||||
@ -281,6 +290,53 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout << COL_BACKGROUND <<std::endl;
|
std::cout << COL_BACKGROUND <<std::endl;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Debug and performance options
|
||||||
|
////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||||
|
Grid_debug_handler_init();
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
|
||||||
|
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
|
||||||
|
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
|
||||||
|
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||||
|
LebesgueOrder::UseLebesgueOrder=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||||
|
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
|
||||||
|
GridLogTimestamp(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
GridParseLayout(*argv,*argc,
|
||||||
|
Grid_default_latt,
|
||||||
|
Grid_default_mpi);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||||
|
std::cout<<GridLogMessage<<"Grid Decomposition\n";
|
||||||
|
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Grid_is_initialised = 1;
|
Grid_is_initialised = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,11 @@ if BUILD_COMMS_MPI3
|
|||||||
extra_sources+=communicator/Communicator_base.cc
|
extra_sources+=communicator/Communicator_base.cc
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
if BUILD_COMMS_MPI3L
|
||||||
|
extra_sources+=communicator/Communicator_mpi3_leader.cc
|
||||||
|
extra_sources+=communicator/Communicator_base.cc
|
||||||
|
endif
|
||||||
|
|
||||||
if BUILD_COMMS_SHMEM
|
if BUILD_COMMS_SHMEM
|
||||||
extra_sources+=communicator/Communicator_shmem.cc
|
extra_sources+=communicator/Communicator_shmem.cc
|
||||||
extra_sources+=communicator/Communicator_base.cc
|
extra_sources+=communicator/Communicator_base.cc
|
||||||
|
@ -31,14 +31,8 @@ namespace Grid {
|
|||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Info that is setup once and indept of cartesian layout
|
// Info that is setup once and indept of cartesian layout
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
int CartesianCommunicator::ShmRank;
|
|
||||||
int CartesianCommunicator::ShmSize;
|
|
||||||
int CartesianCommunicator::GroupRank;
|
|
||||||
int CartesianCommunicator::GroupSize;
|
|
||||||
int CartesianCommunicator::WorldRank;
|
|
||||||
int CartesianCommunicator::WorldSize;
|
|
||||||
int CartesianCommunicator::Slave;
|
|
||||||
void * CartesianCommunicator::ShmCommBuf;
|
void * CartesianCommunicator::ShmCommBuf;
|
||||||
|
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
// Alloc, free shmem region
|
// Alloc, free shmem region
|
||||||
@ -48,7 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
|
|||||||
void *ptr = (void *)heap_top;
|
void *ptr = (void *)heap_top;
|
||||||
heap_top += bytes;
|
heap_top += bytes;
|
||||||
heap_bytes+= bytes;
|
heap_bytes+= bytes;
|
||||||
assert(heap_bytes < MAX_MPI_SHM_BYTES);
|
if (heap_bytes >= MAX_MPI_SHM_BYTES) {
|
||||||
|
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
||||||
|
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
||||||
|
std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
|
||||||
|
assert(heap_bytes<MAX_MPI_SHM_BYTES);
|
||||||
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::ShmBufferFreeAll(void) {
|
void CartesianCommunicator::ShmBufferFreeAll(void) {
|
||||||
@ -69,12 +68,6 @@ int CartesianCommunicator::ProcessorCount(void) { return
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int CartesianCommunicator::RankWorld(void){ return WorldRank; };
|
|
||||||
int CartesianCommunicator::Ranks (void) { return WorldSize; };
|
|
||||||
int CartesianCommunicator::Nodes (void) { return GroupSize; };
|
|
||||||
int CartesianCommunicator::Cores (void) { return ShmSize; };
|
|
||||||
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
|
|
||||||
int CartesianCommunicator::CoreRank (void) { return ShmRank; };
|
|
||||||
|
|
||||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||||
{
|
{
|
||||||
@ -93,7 +86,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
|||||||
GlobalSumVector((double *)c,2*N);
|
GlobalSumVector((double *)c,2*N);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef GRID_COMMS_MPI3
|
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
|
||||||
|
|
||||||
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -37,6 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_MPI3
|
#ifdef GRID_COMMS_MPI3
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_COMMS_MPI3L
|
||||||
|
#include <mpi.h>
|
||||||
|
#endif
|
||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
#include <mpp/shmem.h>
|
#include <mpp/shmem.h>
|
||||||
#endif
|
#endif
|
||||||
@ -51,7 +55,7 @@ class CartesianCommunicator {
|
|||||||
// Give external control (command line override?) of this
|
// Give external control (command line override?) of this
|
||||||
|
|
||||||
static const int MAXLOG2RANKSPERNODE = 16;
|
static const int MAXLOG2RANKSPERNODE = 16;
|
||||||
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
|
static uint64_t MAX_MPI_SHM_BYTES;
|
||||||
|
|
||||||
// Communicator should know nothing of the physics grid, only processor grid.
|
// Communicator should know nothing of the physics grid, only processor grid.
|
||||||
int _Nprocessors; // How many in all
|
int _Nprocessors; // How many in all
|
||||||
@ -60,9 +64,9 @@ class CartesianCommunicator {
|
|||||||
std::vector<int> _processor_coor; // linear processor coordinate
|
std::vector<int> _processor_coor; // linear processor coordinate
|
||||||
unsigned long _ndimension;
|
unsigned long _ndimension;
|
||||||
|
|
||||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
|
||||||
MPI_Comm communicator;
|
|
||||||
static MPI_Comm communicator_world;
|
static MPI_Comm communicator_world;
|
||||||
|
MPI_Comm communicator;
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
#else
|
#else
|
||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
@ -75,7 +79,15 @@ class CartesianCommunicator {
|
|||||||
// cartesian communicator on a subset of ranks, slave ranks controlled
|
// cartesian communicator on a subset of ranks, slave ranks controlled
|
||||||
// by group leader with data xfer via shared memory
|
// by group leader with data xfer via shared memory
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_COMMS_MPI3
|
#ifdef GRID_COMMS_MPI3
|
||||||
|
|
||||||
|
static int ShmRank;
|
||||||
|
static int ShmSize;
|
||||||
|
static int GroupRank;
|
||||||
|
static int GroupSize;
|
||||||
|
static int WorldRank;
|
||||||
|
static int WorldSize;
|
||||||
|
|
||||||
std::vector<int> WorldDims;
|
std::vector<int> WorldDims;
|
||||||
std::vector<int> GroupDims;
|
std::vector<int> GroupDims;
|
||||||
std::vector<int> ShmDims;
|
std::vector<int> ShmDims;
|
||||||
@ -83,7 +95,7 @@ class CartesianCommunicator {
|
|||||||
std::vector<int> GroupCoor;
|
std::vector<int> GroupCoor;
|
||||||
std::vector<int> ShmCoor;
|
std::vector<int> ShmCoor;
|
||||||
std::vector<int> WorldCoor;
|
std::vector<int> WorldCoor;
|
||||||
|
|
||||||
static std::vector<int> GroupRanks;
|
static std::vector<int> GroupRanks;
|
||||||
static std::vector<int> MyGroup;
|
static std::vector<int> MyGroup;
|
||||||
static int ShmSetup;
|
static int ShmSetup;
|
||||||
@ -93,13 +105,20 @@ class CartesianCommunicator {
|
|||||||
std::vector<int> LexicographicToWorldRank;
|
std::vector<int> LexicographicToWorldRank;
|
||||||
|
|
||||||
static std::vector<void *> ShmCommBufs;
|
static std::vector<void *> ShmCommBufs;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
static void ShmInitGeneric(void);
|
static void ShmInitGeneric(void);
|
||||||
static commVector<uint8_t> ShmBufStorageVector;
|
static commVector<uint8_t> ShmBufStorageVector;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
// Grid information and queries
|
||||||
|
// Implemented in Communicator_base.C
|
||||||
|
/////////////////////////////////
|
||||||
static void * ShmCommBuf;
|
static void * ShmCommBuf;
|
||||||
size_t heap_top;
|
size_t heap_top;
|
||||||
size_t heap_bytes;
|
size_t heap_bytes;
|
||||||
|
|
||||||
void *ShmBufferSelf(void);
|
void *ShmBufferSelf(void);
|
||||||
void *ShmBuffer(int rank);
|
void *ShmBuffer(int rank);
|
||||||
void *ShmBufferTranslate(int rank,void * local_p);
|
void *ShmBufferTranslate(int rank,void * local_p);
|
||||||
@ -123,28 +142,12 @@ class CartesianCommunicator {
|
|||||||
int RankFromProcessorCoor(std::vector<int> &coor);
|
int RankFromProcessorCoor(std::vector<int> &coor);
|
||||||
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
||||||
|
|
||||||
/////////////////////////////////
|
|
||||||
// Grid information and queries
|
|
||||||
/////////////////////////////////
|
|
||||||
static int ShmRank;
|
|
||||||
static int ShmSize;
|
|
||||||
static int GroupSize;
|
|
||||||
static int GroupRank;
|
|
||||||
static int WorldRank;
|
|
||||||
static int WorldSize;
|
|
||||||
static int Slave;
|
|
||||||
|
|
||||||
int IsBoss(void) ;
|
int IsBoss(void) ;
|
||||||
int BossRank(void) ;
|
int BossRank(void) ;
|
||||||
int ThisRank(void) ;
|
int ThisRank(void) ;
|
||||||
const std::vector<int> & ThisProcessorCoor(void) ;
|
const std::vector<int> & ThisProcessorCoor(void) ;
|
||||||
const std::vector<int> & ProcessorGrid(void) ;
|
const std::vector<int> & ProcessorGrid(void) ;
|
||||||
int ProcessorCount(void) ;
|
int ProcessorCount(void) ;
|
||||||
static int Ranks (void);
|
|
||||||
static int Nodes (void);
|
|
||||||
static int Cores (void);
|
|
||||||
static int NodeRank (void);
|
|
||||||
static int CoreRank (void);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
|
@ -44,13 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
MPI_Init(argc,argv);
|
MPI_Init(argc,argv);
|
||||||
}
|
}
|
||||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||||
MPI_Comm_rank(communicator_world,&WorldRank);
|
|
||||||
MPI_Comm_size(communicator_world,&WorldSize);
|
|
||||||
ShmRank=0;
|
|
||||||
ShmSize=1;
|
|
||||||
GroupRank=WorldRank;
|
|
||||||
GroupSize=WorldSize;
|
|
||||||
Slave =0;
|
|
||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,6 +191,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|||||||
// Should only be used prior to Grid Init finished.
|
// Should only be used prior to Grid Init finished.
|
||||||
// Check for this?
|
// Check for this?
|
||||||
///////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////
|
||||||
|
int CartesianCommunicator::RankWorld(void){
|
||||||
|
int r;
|
||||||
|
MPI_Comm_rank(communicator_world,&r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
{
|
{
|
||||||
int ierr= MPI_Bcast(data,
|
int ierr= MPI_Bcast(data,
|
||||||
|
@ -30,12 +30,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Info that is setup once and indept of cartesian layout
|
// Info that is setup once and indept of cartesian layout
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
int CartesianCommunicator::ShmSetup = 0;
|
int CartesianCommunicator::ShmSetup = 0;
|
||||||
|
|
||||||
|
int CartesianCommunicator::ShmRank;
|
||||||
|
int CartesianCommunicator::ShmSize;
|
||||||
|
int CartesianCommunicator::GroupRank;
|
||||||
|
int CartesianCommunicator::GroupSize;
|
||||||
|
int CartesianCommunicator::WorldRank;
|
||||||
|
int CartesianCommunicator::WorldSize;
|
||||||
|
|
||||||
MPI_Comm CartesianCommunicator::communicator_world;
|
MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
MPI_Comm CartesianCommunicator::ShmComm;
|
MPI_Comm CartesianCommunicator::ShmComm;
|
||||||
MPI_Win CartesianCommunicator::ShmWindow;
|
MPI_Win CartesianCommunicator::ShmWindow;
|
||||||
@ -97,15 +103,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
|
|
||||||
std::vector<int> world_ranks(WorldSize);
|
std::vector<int> world_ranks(WorldSize);
|
||||||
GroupRanks.resize(WorldSize);
|
GroupRanks.resize(WorldSize);
|
||||||
MyGroup.resize(ShmSize);
|
|
||||||
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
|
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
|
||||||
|
|
||||||
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
|
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Identify who is in my group and noninate the leader
|
// Identify who is in my group and noninate the leader
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
int g=0;
|
int g=0;
|
||||||
|
MyGroup.resize(ShmSize);
|
||||||
for(int rank=0;rank<WorldSize;rank++){
|
for(int rank=0;rank<WorldSize;rank++){
|
||||||
if(GroupRanks[rank]!=MPI_UNDEFINED){
|
if(GroupRanks[rank]!=MPI_UNDEFINED){
|
||||||
assert(g<ShmSize);
|
assert(g<ShmSize);
|
||||||
|
871
lib/communicator/Communicator_mpi3_leader.cc
Normal file
871
lib/communicator/Communicator_mpi3_leader.cc
Normal file
@ -0,0 +1,871 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include "Grid.h"
|
||||||
|
#include <mpi.h>
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/// Workarounds:
|
||||||
|
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
|
||||||
|
/// darwin dispatch semaphores don't seem to be multiprocess.
|
||||||
|
///
|
||||||
|
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
|
||||||
|
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
|
||||||
|
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
|
||||||
|
///
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#include <semaphore.h>
|
||||||
|
typedef sem_t *Grid_semaphore;
|
||||||
|
|
||||||
|
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
|
||||||
|
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
|
||||||
|
#define SEM_POST(S) assert ( sem_post(S) == 0 );
|
||||||
|
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
|
||||||
|
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
|
||||||
|
|
||||||
|
struct Descriptor {
|
||||||
|
uint64_t buf;
|
||||||
|
size_t bytes;
|
||||||
|
int rank;
|
||||||
|
int tag;
|
||||||
|
int command;
|
||||||
|
MPI_Request request;
|
||||||
|
};
|
||||||
|
|
||||||
|
const int pool = 48;
|
||||||
|
|
||||||
|
class SlaveState {
|
||||||
|
public:
|
||||||
|
volatile int head;
|
||||||
|
volatile int start;
|
||||||
|
volatile int tail;
|
||||||
|
volatile Descriptor Descrs[pool];
|
||||||
|
};
|
||||||
|
|
||||||
|
class Slave {
|
||||||
|
public:
|
||||||
|
Grid_semaphore sem_head;
|
||||||
|
Grid_semaphore sem_tail;
|
||||||
|
SlaveState *state;
|
||||||
|
MPI_Comm squadron;
|
||||||
|
uint64_t base;
|
||||||
|
int universe_rank;
|
||||||
|
int vertical_rank;
|
||||||
|
char sem_name [NAME_MAX];
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Descriptor circular pointers
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
Slave() {};
|
||||||
|
|
||||||
|
void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
|
||||||
|
|
||||||
|
void SemInit(void) {
|
||||||
|
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
|
||||||
|
printf("SEM_NAME: %s \n",sem_name);
|
||||||
|
SEM_INIT(sem_head);
|
||||||
|
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
|
||||||
|
printf("SEM_NAME: %s \n",sem_name);
|
||||||
|
SEM_INIT(sem_tail);
|
||||||
|
}
|
||||||
|
void SemInitExcl(void) {
|
||||||
|
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
|
||||||
|
printf("SEM_INIT_EXCL: %s \n",sem_name);
|
||||||
|
SEM_INIT_EXCL(sem_head);
|
||||||
|
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
|
||||||
|
printf("SEM_INIT_EXCL: %s \n",sem_name);
|
||||||
|
SEM_INIT_EXCL(sem_tail);
|
||||||
|
}
|
||||||
|
void WakeUpDMA(void) {
|
||||||
|
SEM_POST(sem_head);
|
||||||
|
};
|
||||||
|
void WakeUpCompute(void) {
|
||||||
|
SEM_POST(sem_tail);
|
||||||
|
};
|
||||||
|
void WaitForCommand(void) {
|
||||||
|
SEM_WAIT(sem_head);
|
||||||
|
};
|
||||||
|
void WaitForComplete(void) {
|
||||||
|
SEM_WAIT(sem_tail);
|
||||||
|
};
|
||||||
|
void EventLoop (void) {
|
||||||
|
std::cout<< " Entering event loop "<<std::endl;
|
||||||
|
while(1){
|
||||||
|
WaitForCommand();
|
||||||
|
// std::cout << "Getting command "<<std::endl;
|
||||||
|
Event();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Event (void) ;
|
||||||
|
|
||||||
|
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
|
||||||
|
|
||||||
|
void WaitAll() {
|
||||||
|
// std::cout << "Queueing WAIT command "<<std::endl;
|
||||||
|
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
|
||||||
|
// std::cout << "Waking up DMA "<<std::endl;
|
||||||
|
WakeUpDMA();
|
||||||
|
// std::cout << "Waiting from semaphore "<<std::endl;
|
||||||
|
WaitForComplete();
|
||||||
|
// std::cout << "Checking FIFO is empty "<<std::endl;
|
||||||
|
assert ( state->tail == state->head );
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// One instance of a data mover.
|
||||||
|
// Master and Slave must agree on location in shared memory
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
class MPIoffloadEngine {
|
||||||
|
public:
|
||||||
|
|
||||||
|
static std::vector<Slave> Slaves;
|
||||||
|
|
||||||
|
static int ShmSetup;
|
||||||
|
|
||||||
|
static int UniverseRank;
|
||||||
|
static int UniverseSize;
|
||||||
|
|
||||||
|
static MPI_Comm communicator_universe;
|
||||||
|
static MPI_Comm communicator_cached;
|
||||||
|
|
||||||
|
static MPI_Comm HorizontalComm;
|
||||||
|
static int HorizontalRank;
|
||||||
|
static int HorizontalSize;
|
||||||
|
|
||||||
|
static MPI_Comm VerticalComm;
|
||||||
|
static MPI_Win VerticalWindow;
|
||||||
|
static int VerticalSize;
|
||||||
|
static int VerticalRank;
|
||||||
|
|
||||||
|
static std::vector<void *> VerticalShmBufs;
|
||||||
|
static std::vector<std::vector<int> > UniverseRanks;
|
||||||
|
static std::vector<int> UserCommunicatorToWorldRanks;
|
||||||
|
|
||||||
|
static MPI_Group WorldGroup, CachedGroup;
|
||||||
|
|
||||||
|
static void CommunicatorInit (MPI_Comm &communicator_world,
|
||||||
|
MPI_Comm &ShmComm,
|
||||||
|
void * &ShmCommBuf);
|
||||||
|
|
||||||
|
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// routines for master proc must handle any communicator
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||||
|
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
|
||||||
|
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
|
||||||
|
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
|
||||||
|
Slaves[slave].WakeUpDMA();
|
||||||
|
// std::cout << "Waking up DMA "<< slave<<std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||||
|
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
|
||||||
|
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
|
||||||
|
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
|
||||||
|
Slaves[slave].WakeUpDMA();
|
||||||
|
// std::cout << "Waking up DMA "<< slave<<std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void WaitAll() {
|
||||||
|
for(int s=1;s<VerticalSize;s++) {
|
||||||
|
// std::cout << "Waiting for slave "<< s<<std::endl;
|
||||||
|
Slaves[s].WaitAll();
|
||||||
|
}
|
||||||
|
// std::cout << " Wait all Complete "<<std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
|
||||||
|
int basework = nwork/units;
|
||||||
|
int backfill = units-(nwork%units);
|
||||||
|
if ( me >= units ) {
|
||||||
|
mywork = myoff = 0;
|
||||||
|
} else {
|
||||||
|
mywork = (nwork+me)/units;
|
||||||
|
myoff = basework * me;
|
||||||
|
if ( me > backfill )
|
||||||
|
myoff+= (me-backfill);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||||
|
uint8_t * cbuf = (uint8_t *) buf;
|
||||||
|
int mywork, myoff, procs;
|
||||||
|
procs = VerticalSize-1;
|
||||||
|
for(int s=0;s<procs;s++) {
|
||||||
|
GetWork(bytes,s,mywork,myoff,procs);
|
||||||
|
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||||
|
uint8_t * cbuf = (uint8_t *) buf;
|
||||||
|
int mywork, myoff, procs;
|
||||||
|
procs = VerticalSize-1;
|
||||||
|
for(int s=0;s<procs;s++) {
|
||||||
|
GetWork(bytes,s,mywork,myoff,procs);
|
||||||
|
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Info that is setup once and indept of cartesian layout
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
std::vector<Slave> MPIoffloadEngine::Slaves;
|
||||||
|
|
||||||
|
int MPIoffloadEngine::UniverseRank;
|
||||||
|
int MPIoffloadEngine::UniverseSize;
|
||||||
|
|
||||||
|
MPI_Comm MPIoffloadEngine::communicator_universe;
|
||||||
|
MPI_Comm MPIoffloadEngine::communicator_cached;
|
||||||
|
MPI_Group MPIoffloadEngine::WorldGroup;
|
||||||
|
MPI_Group MPIoffloadEngine::CachedGroup;
|
||||||
|
|
||||||
|
MPI_Comm MPIoffloadEngine::HorizontalComm;
|
||||||
|
int MPIoffloadEngine::HorizontalRank;
|
||||||
|
int MPIoffloadEngine::HorizontalSize;
|
||||||
|
|
||||||
|
MPI_Comm MPIoffloadEngine::VerticalComm;
|
||||||
|
int MPIoffloadEngine::VerticalSize;
|
||||||
|
int MPIoffloadEngine::VerticalRank;
|
||||||
|
MPI_Win MPIoffloadEngine::VerticalWindow;
|
||||||
|
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
|
||||||
|
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
|
||||||
|
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
|
||||||
|
|
||||||
|
int MPIoffloadEngine::ShmSetup = 0;
|
||||||
|
|
||||||
|
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
|
||||||
|
MPI_Comm &ShmComm,
|
||||||
|
void * &ShmCommBuf)
|
||||||
|
{
|
||||||
|
int flag;
|
||||||
|
assert(ShmSetup==0);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Universe is all nodes prior to squadron grouping
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
|
||||||
|
MPI_Comm_rank(communicator_universe,&UniverseRank);
|
||||||
|
MPI_Comm_size(communicator_universe,&UniverseSize);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////
|
||||||
|
// Split into groups that can share memory (Verticals)
|
||||||
|
/////////////////////////////////////////////////////////////////////
|
||||||
|
#define MPI_SHARED_MEM_DEBUG
|
||||||
|
#ifdef MPI_SHARED_MEM_DEBUG
|
||||||
|
MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
|
||||||
|
#else
|
||||||
|
MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
|
||||||
|
#endif
|
||||||
|
MPI_Comm_rank(VerticalComm ,&VerticalRank);
|
||||||
|
MPI_Comm_size(VerticalComm ,&VerticalSize);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Split into horizontal groups by rank in squadron
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
|
||||||
|
MPI_Comm_rank(HorizontalComm,&HorizontalRank);
|
||||||
|
MPI_Comm_size(HorizontalComm,&HorizontalSize);
|
||||||
|
assert(HorizontalSize*VerticalSize==UniverseSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// What is my place in the world
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int WorldRank=0;
|
||||||
|
if(VerticalRank==0) WorldRank = HorizontalRank;
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
|
||||||
|
assert(ierr==0);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Where is the world in the universe?
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
|
||||||
|
UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
|
||||||
|
for(int w=0;w<HorizontalSize;w++){
|
||||||
|
ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
VerticalShmBufs.resize(VerticalSize);
|
||||||
|
|
||||||
|
#undef MPI_SHARED_MEM
|
||||||
|
#ifdef MPI_SHARED_MEM
|
||||||
|
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
|
||||||
|
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
|
||||||
|
assert(ierr==0);
|
||||||
|
// std::cout<<"SHM "<<ShmCommBuf<<std::endl;
|
||||||
|
|
||||||
|
for(int r=0;r<VerticalSize;r++){
|
||||||
|
MPI_Aint sz;
|
||||||
|
int dsp_unit;
|
||||||
|
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
|
||||||
|
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
char shm_name [NAME_MAX];
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
|
||||||
|
if ( VerticalRank == 0 ) {
|
||||||
|
for(int r=0;r<VerticalSize;r++){
|
||||||
|
|
||||||
|
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||||
|
if ( r>0 ) size = sizeof(SlaveState);
|
||||||
|
|
||||||
|
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
|
||||||
|
|
||||||
|
shm_unlink(shm_name);
|
||||||
|
|
||||||
|
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
|
||||||
|
if ( fd < 0 ) {
|
||||||
|
perror("failed shm_open");
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
ftruncate(fd, size);
|
||||||
|
|
||||||
|
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||||
|
|
||||||
|
if ( VerticalShmBufs[r] == MAP_FAILED ) {
|
||||||
|
perror("failed mmap");
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
|
||||||
|
check[0] = WorldRank;
|
||||||
|
check[1] = r;
|
||||||
|
|
||||||
|
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
|
||||||
|
if ( VerticalRank != 0 ) {
|
||||||
|
for(int r=0;r<VerticalSize;r++){
|
||||||
|
|
||||||
|
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
|
||||||
|
if ( r>0 ) size = sizeof(SlaveState);
|
||||||
|
|
||||||
|
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
|
||||||
|
|
||||||
|
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
|
||||||
|
if ( fd<0 ) {
|
||||||
|
perror("failed shm_open");
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||||
|
|
||||||
|
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
|
||||||
|
assert(check[0]== WorldRank);
|
||||||
|
assert(check[1]== r);
|
||||||
|
std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Map rank of leader on node in their in new world, to the
|
||||||
|
// rank in this vertical plane's horizontal communicator
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
communicator_world = HorizontalComm;
|
||||||
|
ShmComm = VerticalComm;
|
||||||
|
ShmCommBuf = VerticalShmBufs[0];
|
||||||
|
MPI_Comm_group (communicator_world, &WorldGroup);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Start the slave data movers
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
if ( VerticalRank != 0 ) {
|
||||||
|
Slave indentured;
|
||||||
|
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
|
||||||
|
indentured.SemInitExcl();// init semaphore in shared memory
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
indentured.EventLoop();
|
||||||
|
assert(0);
|
||||||
|
} else {
|
||||||
|
Slaves.resize(VerticalSize);
|
||||||
|
for(int i=1;i<VerticalSize;i++){
|
||||||
|
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
|
||||||
|
}
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
for(int i=1;i<VerticalSize;i++){
|
||||||
|
Slaves[i].SemInit();// init semaphore in shared memory
|
||||||
|
}
|
||||||
|
MPI_Barrier(VerticalComm);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Verbose for now
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
ShmSetup=1;
|
||||||
|
|
||||||
|
if (UniverseRank == 0){
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
|
||||||
|
std::cout<<UniverseSize << " Ranks " ;
|
||||||
|
std::cout<<HorizontalSize << " Nodes " ;
|
||||||
|
std::cout<<VerticalSize << " with ranks-per-node "<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
|
||||||
|
|
||||||
|
for(int g=0;g<HorizontalSize;g++){
|
||||||
|
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int g=0;g<HorizontalSize;g++){
|
||||||
|
std::cout<<GridLogMessage<<" { ";
|
||||||
|
for(int s=0;s<VerticalSize;s++){
|
||||||
|
std::cout<< UniverseRanks[g][s];
|
||||||
|
if ( s<VerticalSize-1 ) {
|
||||||
|
std::cout<<",";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<" } "<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Map the communicator into communicator_world, and find the neighbour.
|
||||||
|
// Cache the mappings; cache size is 1.
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
|
||||||
|
|
||||||
|
if ( comm == HorizontalComm ) {
|
||||||
|
comm_world_peer = rank;
|
||||||
|
// std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
|
} else if ( comm == communicator_cached ) {
|
||||||
|
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||||
|
// std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int size;
|
||||||
|
|
||||||
|
MPI_Comm_size(comm,&size);
|
||||||
|
|
||||||
|
UserCommunicatorToWorldRanks.resize(size);
|
||||||
|
|
||||||
|
std::vector<int> cached_ranks(size);
|
||||||
|
|
||||||
|
for(int r=0;r<size;r++) {
|
||||||
|
cached_ranks[r]=r;
|
||||||
|
}
|
||||||
|
|
||||||
|
communicator_cached=comm;
|
||||||
|
|
||||||
|
MPI_Comm_group(communicator_cached, &CachedGroup);
|
||||||
|
|
||||||
|
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
|
||||||
|
|
||||||
|
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||||
|
// std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||||
|
|
||||||
|
assert(comm_world_peer != MPI_UNDEFINED);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert( (tag & (~0xFFFFL)) ==0);
|
||||||
|
|
||||||
|
uint64_t icomm = (uint64_t)comm;
|
||||||
|
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
|
||||||
|
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
|
||||||
|
|
||||||
|
// hashtag = (comm_hash<<15) | tag;
|
||||||
|
hashtag = tag;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
|
||||||
|
{
|
||||||
|
squadron=_squadron;
|
||||||
|
universe_rank=_universe_rank;
|
||||||
|
vertical_rank=_vertical_rank;
|
||||||
|
state =_state;
|
||||||
|
std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
|
||||||
|
state->head = state->tail = state->start = 0;
|
||||||
|
base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
|
||||||
|
int rank; MPI_Comm_rank(_squadron,&rank);
|
||||||
|
}
|
||||||
|
#define PERI_PLUS(A) ( (A+1)%pool )
|
||||||
|
int Slave::Event (void) {
|
||||||
|
|
||||||
|
static int tail_last;
|
||||||
|
static int head_last;
|
||||||
|
static int start_last;
|
||||||
|
int ierr;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////
|
||||||
|
// Try to advance the start pointers
|
||||||
|
////////////////////////////////////////////////////
|
||||||
|
int s=state->start;
|
||||||
|
if ( s != state->head ) {
|
||||||
|
switch ( state->Descrs[s].command ) {
|
||||||
|
case COMMAND_ISEND:
|
||||||
|
/*
|
||||||
|
std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
|
||||||
|
<< " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
|
||||||
|
<< " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
|
||||||
|
*/
|
||||||
|
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
|
||||||
|
state->Descrs[s].bytes,
|
||||||
|
MPI_CHAR,
|
||||||
|
state->Descrs[s].rank,
|
||||||
|
state->Descrs[s].tag,
|
||||||
|
MPIoffloadEngine::communicator_universe,
|
||||||
|
(MPI_Request *)&state->Descrs[s].request);
|
||||||
|
assert(ierr==0);
|
||||||
|
state->start = PERI_PLUS(s);
|
||||||
|
return 1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case COMMAND_IRECV:
|
||||||
|
/*
|
||||||
|
std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
|
||||||
|
<< " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
|
||||||
|
<< " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
|
||||||
|
*/
|
||||||
|
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
|
||||||
|
state->Descrs[s].bytes,
|
||||||
|
MPI_CHAR,
|
||||||
|
state->Descrs[s].rank,
|
||||||
|
state->Descrs[s].tag,
|
||||||
|
MPIoffloadEngine::communicator_universe,
|
||||||
|
(MPI_Request *)&state->Descrs[s].request);
|
||||||
|
|
||||||
|
// std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
|
||||||
|
// std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
|
||||||
|
assert(ierr==0);
|
||||||
|
state->start = PERI_PLUS(s);
|
||||||
|
return 1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case COMMAND_WAITALL:
|
||||||
|
|
||||||
|
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
|
||||||
|
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
|
||||||
|
};
|
||||||
|
s=PERI_PLUS(s);
|
||||||
|
state->start = s;
|
||||||
|
state->tail = s;
|
||||||
|
|
||||||
|
WakeUpCompute();
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
// External interaction with the queue
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
|
||||||
|
{
|
||||||
|
/////////////////////////////////////////
|
||||||
|
// Spin; if FIFO is full until not full
|
||||||
|
/////////////////////////////////////////
|
||||||
|
int head =state->head;
|
||||||
|
int next = PERI_PLUS(head);
|
||||||
|
|
||||||
|
// Set up descriptor
|
||||||
|
int worldrank;
|
||||||
|
int hashtag;
|
||||||
|
MPI_Comm communicator;
|
||||||
|
MPI_Request request;
|
||||||
|
|
||||||
|
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
|
||||||
|
|
||||||
|
uint64_t relative= (uint64_t)buf - base;
|
||||||
|
state->Descrs[head].buf = relative;
|
||||||
|
state->Descrs[head].bytes = bytes;
|
||||||
|
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
|
||||||
|
state->Descrs[head].tag = hashtag;
|
||||||
|
state->Descrs[head].command= command;
|
||||||
|
|
||||||
|
/*
|
||||||
|
if ( command == COMMAND_ISEND ) {
|
||||||
|
std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
|
||||||
|
<< " to worldrank " << worldrank <<std::endl;
|
||||||
|
std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
|
||||||
|
std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
|
||||||
|
}
|
||||||
|
if ( command == COMMAND_IRECV ) {
|
||||||
|
std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
|
||||||
|
<< " from worldrank " << worldrank <<std::endl;
|
||||||
|
std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
|
||||||
|
std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
// Block until FIFO has space
|
||||||
|
while( state->tail==next );
|
||||||
|
|
||||||
|
// Msync on weak order architectures
|
||||||
|
// Advance pointer
|
||||||
|
state->head = next;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Info that is setup once and indept of cartesian layout
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
|
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv)
|
||||||
|
{
|
||||||
|
int flag;
|
||||||
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
|
if ( !flag ) {
|
||||||
|
MPI_Init(argc,argv);
|
||||||
|
}
|
||||||
|
communicator_world = MPI_COMM_WORLD;
|
||||||
|
MPI_Comm ShmComm;
|
||||||
|
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||||
|
assert(ierr==0);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  // Build a fully periodic Cartesian communicator over the processor
  // grid `processors` (one entry per dimension).
  _ndimension = processors.size();
  _processors = processors;
  _processor_coor.resize(_ndimension);

  std::vector<int> periodic(_ndimension, 1); // periodic in every dimension

  _Nprocessors = 1;
  for (int d = 0; d < _ndimension; d++) {
    _Nprocessors *= _processors[d];
  }

  // The requested grid must exactly tile the world communicator.
  int Size = 0;
  MPI_Comm_size(communicator_world, &Size);
  assert(Size == _Nprocessors);

  // Fix: check the return codes — every other MPI wrapper in this file
  // asserts ierr==0; these three calls previously went unchecked.
  int ierr;
  ierr = MPI_Cart_create(communicator_world, _ndimension, &_processors[0], &periodic[0], 1, &communicator);
  assert(ierr == 0);
  ierr = MPI_Comm_rank(communicator, &_processor);
  assert(ierr == 0);
  ierr = MPI_Cart_coords(communicator, _processor, _ndimension, &_processor_coor[0]);
  assert(ierr == 0);
};
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
MPI_Status stat;
|
||||||
|
assert(sender != receiver);
|
||||||
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
MPI_Request xrq;
|
||||||
|
MPI_Request rrq;
|
||||||
|
int rank = _processor;
|
||||||
|
int ierr;
|
||||||
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||||
|
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||||
|
|
||||||
|
assert(ierr==0);
|
||||||
|
|
||||||
|
list.push_back(xrq);
|
||||||
|
list.push_back(rrq);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
uint64_t xmit_i = (uint64_t) xmit;
|
||||||
|
uint64_t recv_i = (uint64_t) recv;
|
||||||
|
uint64_t shm = (uint64_t) ShmCommBuf;
|
||||||
|
// assert xmit and recv lie in shared memory region
|
||||||
|
assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
|
||||||
|
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
|
||||||
|
assert(from!=_processor);
|
||||||
|
assert(dest!=_processor);
|
||||||
|
MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
|
||||||
|
MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
MPIoffloadEngine::WaitAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::StencilBarrier(void)
{
  // Intentionally empty in this implementation; presumably the offload
  // engine's WaitAll already provides the required ordering — confirm
  // against the other communicator backends.
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
int nreq=list.size();
|
||||||
|
std::vector<MPI_Status> status(nreq);
|
||||||
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Barrier(communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr= MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator_world);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Base address of this rank's shared-memory communications window.
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
|
||||||
|
|
||||||
|
void *CartesianCommunicator::ShmBuffer(int rank)
{
  // Peer ranks' windows are not exposed by this backend: always NULL.
  return NULL;
}
|
||||||
|
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
  // Address translation into a peer's window is not supported by this
  // backend: always NULL.
  return NULL;
}
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
@ -34,13 +34,6 @@ namespace Grid {
|
|||||||
|
|
||||||
void CartesianCommunicator::Init(int *argc, char *** arv)
|
void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||||
{
|
{
|
||||||
WorldRank = 0;
|
|
||||||
WorldSize = 1;
|
|
||||||
ShmRank=0;
|
|
||||||
ShmSize=1;
|
|
||||||
GroupRank=WorldRank;
|
|
||||||
GroupSize=WorldSize;
|
|
||||||
Slave =0;
|
|
||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,6 +92,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int CartesianCommunicator::RankWorld(void){return 0;}
|
||||||
void CartesianCommunicator::Barrier(void){}
|
void CartesianCommunicator::Barrier(void){}
|
||||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
|
||||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
|
||||||
|
@ -50,11 +50,16 @@ typedef struct HandShake_t {
|
|||||||
uint64_t seq_remote;
|
uint64_t seq_remote;
|
||||||
} HandShake;
|
} HandShake;
|
||||||
|
|
||||||
|
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
|
||||||
|
array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
|
||||||
|
ret.fill(SHMEM_SYNC_VALUE);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
|
||||||
|
|
||||||
static Vector< HandShake > XConnections;
|
static Vector< HandShake > XConnections;
|
||||||
static Vector< HandShake > RConnections;
|
static Vector< HandShake > RConnections;
|
||||||
|
|
||||||
|
|
||||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
shmem_init();
|
shmem_init();
|
||||||
XConnections.resize(shmem_n_pes());
|
XConnections.resize(shmem_n_pes());
|
||||||
@ -65,13 +70,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
RConnections[pe].seq_local = 0;
|
RConnections[pe].seq_local = 0;
|
||||||
RConnections[pe].seq_remote= 0;
|
RConnections[pe].seq_remote= 0;
|
||||||
}
|
}
|
||||||
WorldSize = shmem_n_pes();
|
|
||||||
WorldRank = shmem_my_pe();
|
|
||||||
ShmRank=0;
|
|
||||||
ShmSize=1;
|
|
||||||
GroupRank=WorldRank;
|
|
||||||
GroupSize=WorldSize;
|
|
||||||
Slave =0;
|
|
||||||
shmem_barrier_all();
|
shmem_barrier_all();
|
||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
@ -103,7 +101,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
|
|||||||
static long long source ;
|
static long long source ;
|
||||||
static long long dest ;
|
static long long dest ;
|
||||||
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
// int nreduce=1;
|
// int nreduce=1;
|
||||||
// int pestart=0;
|
// int pestart=0;
|
||||||
@ -119,7 +117,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
|
|||||||
static long long source ;
|
static long long source ;
|
||||||
static long long dest ;
|
static long long dest ;
|
||||||
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
// int nreduce=1;
|
// int nreduce=1;
|
||||||
// int pestart=0;
|
// int pestart=0;
|
||||||
@ -135,7 +133,7 @@ void CartesianCommunicator::GlobalSum(float &f){
|
|||||||
static float source ;
|
static float source ;
|
||||||
static float dest ;
|
static float dest ;
|
||||||
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
source = f;
|
source = f;
|
||||||
dest =0.0;
|
dest =0.0;
|
||||||
@ -147,7 +145,7 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
|||||||
static float source ;
|
static float source ;
|
||||||
static float dest = 0 ;
|
static float dest = 0 ;
|
||||||
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
if ( shmem_addr_accessible(f,_processor) ){
|
if ( shmem_addr_accessible(f,_processor) ){
|
||||||
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
|
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
@ -166,7 +164,7 @@ void CartesianCommunicator::GlobalSum(double &d)
|
|||||||
static double source;
|
static double source;
|
||||||
static double dest ;
|
static double dest ;
|
||||||
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
source = d;
|
source = d;
|
||||||
dest = 0;
|
dest = 0;
|
||||||
@ -178,7 +176,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
|||||||
static double source ;
|
static double source ;
|
||||||
static double dest ;
|
static double dest ;
|
||||||
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
|
|
||||||
|
|
||||||
if ( shmem_addr_accessible(d,_processor) ){
|
if ( shmem_addr_accessible(d,_processor) ){
|
||||||
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
|
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
@ -295,7 +294,7 @@ void CartesianCommunicator::Barrier(void)
|
|||||||
}
|
}
|
||||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
{
|
{
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
static uint32_t word;
|
static uint32_t word;
|
||||||
uint32_t *array = (uint32_t *) data;
|
uint32_t *array = (uint32_t *) data;
|
||||||
assert( (bytes % 4)==0);
|
assert( (bytes % 4)==0);
|
||||||
@ -318,7 +317,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|||||||
}
|
}
|
||||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
{
|
{
|
||||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||||
static uint32_t word;
|
static uint32_t word;
|
||||||
uint32_t *array = (uint32_t *) data;
|
uint32_t *array = (uint32_t *) data;
|
||||||
assert( (bytes % 4)==0);
|
assert( (bytes % 4)==0);
|
||||||
|
@ -32,8 +32,7 @@ directory
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
int WilsonKernelsStatic::HandOpt;
|
int WilsonKernelsStatic::Opt;
|
||||||
int WilsonKernelsStatic::AsmOpt;
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
|
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
|
||||||
|
@ -40,9 +40,9 @@ namespace QCD {
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class WilsonKernelsStatic {
|
class WilsonKernelsStatic {
|
||||||
public:
|
public:
|
||||||
|
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
// S-direction is INNERMOST and takes no part in the parity.
|
||||||
static int AsmOpt; // these are a temporary hack
|
static int Opt; // these are a temporary hack
|
||||||
static int HandOpt; // these are a temporary hack
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
||||||
@ -56,24 +56,40 @@ public:
|
|||||||
template <bool EnableBool = true>
|
template <bool EnableBool = true>
|
||||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
||||||
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
if (AsmOpt) {
|
case OptInlineAsm:
|
||||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
|
||||||
} else {
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
#endif
|
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if (HandOpt)
|
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||||
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
|
||||||
else
|
|
||||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
case OptHandUnroll:
|
||||||
|
for (int site = 0; site < Ns; site++) {
|
||||||
|
for (int s = 0; s < Ls; s++) {
|
||||||
|
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case OptGeneric:
|
||||||
|
for (int site = 0; site < Ns; site++) {
|
||||||
|
for (int s = 0; s < Ls; s++) {
|
||||||
|
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +97,7 @@ public:
|
|||||||
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
|
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
|
||||||
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||||
|
// no kernel choice
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
|
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
|
||||||
@ -95,23 +111,39 @@ public:
|
|||||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
|
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
|
||||||
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||||
|
|
||||||
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
if (AsmOpt) {
|
case OptInlineAsm:
|
||||||
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
|
||||||
} else {
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
#endif
|
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if (HandOpt)
|
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||||
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
|
||||||
else
|
|
||||||
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
case OptHandUnroll:
|
||||||
|
for (int site = 0; site < Ns; site++) {
|
||||||
|
for (int s = 0; s < Ls; s++) {
|
||||||
|
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case OptGeneric:
|
||||||
|
for (int site = 0; site < Ns; site++) {
|
||||||
|
for (int s = 0; s < Ls; s++) {
|
||||||
|
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
int LebesgueOrder::UseLebesgueOrder;
|
int LebesgueOrder::UseLebesgueOrder;
|
||||||
std::vector<int> LebesgueOrder::Block({2,2,2,2});
|
std::vector<int> LebesgueOrder::Block({8,2,2,2});
|
||||||
|
|
||||||
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
|
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
|
||||||
n--; // 1000 0011 --> 1000 0010
|
n--; // 1000 0011 --> 1000 0010
|
||||||
|
@ -1 +0,0 @@
|
|||||||
./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' --enable-simd=NEONv7
|
|
@ -1,3 +0,0 @@
|
|||||||
#./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
|
|
||||||
|
|
||||||
./configure --host=aarch64-linux-gnu CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
|
|
@ -1,9 +0,0 @@
|
|||||||
for omp in 1 2 4
|
|
||||||
do
|
|
||||||
echo > wilson.t$omp
|
|
||||||
for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
|
|
||||||
do
|
|
||||||
perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'`
|
|
||||||
echo $vol $perf >> wilson.t$omp
|
|
||||||
done
|
|
||||||
done
|
|
@ -1,46 +0,0 @@
|
|||||||
#!/bin/bash -e
|
|
||||||
|
|
||||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
|
|
||||||
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
|
|
||||||
BLACK="\033[30m"
|
|
||||||
RED="\033[31m"
|
|
||||||
GREEN="\033[32m"
|
|
||||||
YELLOW="\033[33m"
|
|
||||||
BLUE="\033[34m"
|
|
||||||
PINK="\033[35m"
|
|
||||||
CYAN="\033[36m"
|
|
||||||
WHITE="\033[37m"
|
|
||||||
NORMAL="\033[0;39m"
|
|
||||||
|
|
||||||
for D in $DIRS
|
|
||||||
do
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $GREEN $D
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $BLUE
|
|
||||||
|
|
||||||
cd builds/$D
|
|
||||||
make clean all -j 8
|
|
||||||
cd ../../
|
|
||||||
echo -e $NORMAL
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "X$1" == "Xextra" ]
|
|
||||||
then
|
|
||||||
for D in $EXTRADIRS
|
|
||||||
do
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $RED $D
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $BLUE
|
|
||||||
|
|
||||||
cd builds/$D
|
|
||||||
make clean all -j 8
|
|
||||||
cd ../../
|
|
||||||
echo -e $NORMAL
|
|
||||||
done
|
|
||||||
fi
|
|
@ -1,11 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
|
|
||||||
|
|
||||||
for D in $DIRS
|
|
||||||
do
|
|
||||||
mkdir -p builds/$D
|
|
||||||
cd builds/$D
|
|
||||||
../../scripts/configure-commands $D
|
|
||||||
cd ../..
|
|
||||||
done
|
|
@ -1,89 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
WD=$1
|
|
||||||
BLACK="\033[30m"
|
|
||||||
RED="\033[31m"
|
|
||||||
GREEN="\033[32m"
|
|
||||||
YELLOW="\033[33m"
|
|
||||||
BLUE="\033[34m"
|
|
||||||
PINK="\033[35m"
|
|
||||||
CYAN="\033[36m"
|
|
||||||
WHITE="\033[37m"
|
|
||||||
NORMAL="\033[0;39m"
|
|
||||||
echo
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $GREEN $WD
|
|
||||||
echo -e $RED ==============================
|
|
||||||
echo -e $YELLOW
|
|
||||||
|
|
||||||
case $WD in
|
|
||||||
g++-avx)
|
|
||||||
CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
g++-avx-openmp)
|
|
||||||
CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
g++5-sse4)
|
|
||||||
CXX=g++-5 ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
g++5-avx)
|
|
||||||
CXX=g++-5 ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
icpc-avx)
|
|
||||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
icpc-avx-openmp-mpi)
|
|
||||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
icpc-avx-openmp)
|
|
||||||
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
icpc-avx2)
|
|
||||||
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
icpc-avx512)
|
|
||||||
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
icpc-mic)
|
|
||||||
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
icpc-mic-avx512)
|
|
||||||
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-sse)
|
|
||||||
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-avx)
|
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-avx2)
|
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-avx-openmp)
|
|
||||||
CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-xc30)
|
|
||||||
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-xc30-openmp)
|
|
||||||
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-avx2-openmp)
|
|
||||||
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
clang-avx-openmp-mpi)
|
|
||||||
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
clang-avx2-openmp-mpi)
|
|
||||||
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
clang-avx-mpi)
|
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
clang-avx2-mpi)
|
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
|
|
||||||
;;
|
|
||||||
clang-avx2)
|
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LDFLAGS="-L/usr/local/lib/" LIBS="-lgmp -lmpfr" --enable-comms=none
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
echo -e $NORMAL
|
|
@ -1,10 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
DIRS="g++-avx-openmp g++-avx clang-xc30 clang-xc30-openmp"
|
|
||||||
|
|
||||||
for D in $DIRS
|
|
||||||
do
|
|
||||||
mkdir -p builds/$D
|
|
||||||
cd builds/$D
|
|
||||||
../../scripts/configure-commands $D
|
|
||||||
cd ../..
|
|
||||||
done
|
|
@ -1,10 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
DIRS="build-icpc-mic"
|
|
||||||
|
|
||||||
for D in $DIRS
|
|
||||||
do
|
|
||||||
mkdir -p $D
|
|
||||||
cd $D
|
|
||||||
../configure-commands
|
|
||||||
cd ..
|
|
||||||
done
|
|
@ -12,6 +12,7 @@ Grid physics library, www.github.com/paboyle/Grid
|
|||||||
Source file: $1
|
Source file: $1
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
Copyright (C) 2016
|
||||||
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -38,8 +39,21 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|
||||||
cat message > tmp.fil
|
cat message > tmp.fil
|
||||||
cat $1 >> tmp.fil
|
|
||||||
|
NOTICE=`grep -n "END LEGAL" $1 | awk '{ print $1 }' `
|
||||||
|
|
||||||
|
if [ "X$NOTICE" != "X" ]
|
||||||
|
then
|
||||||
|
echo "found notice ending on line $NOTICE"
|
||||||
|
awk 'BEGIN { P=0 } { if ( P ) print } /END LEGAL/{P=1} ' $1 >> tmp.fil
|
||||||
|
else
|
||||||
|
cat $1 >> tmp.fil
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
cp tmp.fil $1
|
cp tmp.fil $1
|
||||||
|
|
||||||
shift
|
shift
|
||||||
|
@ -1,2 +0,0 @@
|
|||||||
module swap PrgEnv-cray PrgEnv-intel
|
|
||||||
module swap intel/14.0.4.211 intel/15.0.2.164
|
|
@ -1,4 +0,0 @@
|
|||||||
aclocal -I m4
|
|
||||||
autoheader -f
|
|
||||||
automake -f --add-missing
|
|
||||||
autoconf -f
|
|
@ -1,18 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
if (( $# != 1 )); then
|
|
||||||
echo "usage: `basename $0` <archive>" 1>&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
ARC=$1
|
|
||||||
|
|
||||||
INITDIR=`pwd`
|
|
||||||
rm -rf lib/fftw
|
|
||||||
mkdir lib/fftw
|
|
||||||
|
|
||||||
ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
|
|
||||||
tar -xf ${ARC}
|
|
||||||
cp ${ARCDIR}/api/fftw3.h lib/fftw/
|
|
||||||
|
|
||||||
cd ${INITDIR}
|
|
||||||
rm -rf ${ARCDIR}
|
|
@ -1,7 +0,0 @@
|
|||||||
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
|
|
||||||
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
|
|
||||||
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
|
|
||||||
set terminal 'pdf'
|
|
||||||
set output 'wilson_clang.pdf'
|
|
||||||
replot
|
|
||||||
quit
|
|
Loading…
x
Reference in New Issue
Block a user