/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion5D.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
namespace Grid {
namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
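// Runtime flags: when set, the hand-unrolled or assembly-optimised Dslash kernels are selected in DhopInternal below.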
// 5d lattice for DWF.
template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FiveDimGrid(&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid(&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
M5(_M5),
Umu(_FourDimGrid),
UmuEven(_FourDimRedBlackGrid),
UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
// Allocate the required comms buffer
ImportGauge(_Umu);
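// Zero the accumulated timers (microseconds); they are printed by Report().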
alltime=0;
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu._grid);
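// Fold the -1/2 factor of the Wilson hopping term into the stored (doubled) gauge field.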
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
{
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
// we drop off the innermost fifth dimension
// assert( (disp==1)||(disp==-1) );
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
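// Forward hops (disp=+1) occupy stencil slots 0-3 and backward hops slots 4-7 (see "directions" above);
// gamma passes the complementary index, which the kernel uses to pick the spin projection for the hop.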
int skip = (disp==1) ? 0 : 1;
int dirdisp = dir+skip*4;
int gamma = dir+(1-skip)*4;
assert(dirdisp<=7);
assert(dirdisp>=0);
PARALLEL_FOR_LOOP
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
}
}
};
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st._grid,A._grid);
conformable(st._grid,B._grid);
Compressor compressor(dag);
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange(B,compressor);
Atilde=A;
for(int mu=0;mu<Nd;mu++){
////////////////////////////////////////////////////////////////////////
// Flip gamma if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if ( !dag ) gamma+= Nd;
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for(int sss=0;sss<U._grid->oSites();sss++){
for(int s=0;s<Ls;s++){
int sU=sss;
int sF = s+Ls*sU;
assert ( sF< B._grid->oSites());
assert ( sU< U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
////////////////////////////
// spin trace outer product
////////////////////////////
}
}
Impl::InsertForce5D(mat,Btilde,Atilde,mu);
}
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionGrid());
conformable(A._grid,B._grid);
conformable(GaugeGrid(),mat._grid);
mat.checkerboard = A.checkerboard;
DerivInternal(Stencil,Umu,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid);
conformable(A._grid,B._grid);
assert(B.checkerboard==Odd);
assert(A.checkerboard==Even);
mat.checkerboard = Even;
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid);
conformable(A._grid,B._grid);
assert(B.checkerboard==Even);
assert(A.checkerboard==Odd);
mat.checkerboard = Odd;
DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// if ( Impl::overlapCommsCompute () ) {
// DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
// } else {
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
// }
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
commtime -=usecond();
auto handle = st.HaloExchangeOptBegin(in,compressor);
st.HaloExchangeOptComplete(handle);
commtime +=usecond();
jointime -=usecond();
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 unit-stride linear access streams per thread for the Fermion, suitable for hardware prefetch.
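// In the loops below sU indexes the 4d gauge site; sF = s + Ls*sU indexes the 5d fermion site with s (the fifth dimension) innermost.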
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
}
} else {
if( this->AsmOptDslash ) {
// for(int i=0;i<1;i++){
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
// PerformanceCounter Counter(i);
// Counter.Start();
#pragma omp parallel for
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff,ssoff, sU,sF;
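// Split the work explicitly: cores share the 4d gauge sites, hyperthreads on each core share the Ls slices.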
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
for(int s=soff;s<soff+swork;s++){
sU=ss+ ssoff;
if ( LebesgueOrder::UseLebesgueOrder ) {
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
}
}
}
// Counter.Stop();
// Counter.Report();
// }
} else if( this->HandOptDslash ) {
/*
#pragma omp parallel for schedule(static)
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff,ssoff, sU,sF;
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
sU=ss+ ssoff;
for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
*/
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
}
dslashtime +=usecond();
alltime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
assert(0);
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
commtime -=usecond();
auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 unit-stride linear access streams per thread for the Fermion, suitable for hardware prefetch.
bool local = true;
bool nonlocal = false;
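// First pass: operate on sites whose neighbours are all local, while the halo exchange started above is still in flight.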
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslashtime +=usecond();
jointime -=usecond();
st.HaloExchangeComplete(handle);
jointime +=usecond();
local = false;
nonlocal = true;
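// Second pass: the halo has now arrived, so operate on sites that need non-local neighbour data.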
dslash1time -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslash1time +=usecond();
alltime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Even);
out.checkerboard = Odd;
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
{
out.checkerboard=in.checkerboard;
Dhop(in,out,dag); // -0.5 is included
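// out = (4.0 - M5)*in + Dhop(in): the 5d Wilson operator with the domain-wall mass shift, the -1/2 hopping factor already in Dhop.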
axpy(out,4.0-M5,in,out);
}
FermOpTemplateInstantiate(WilsonFermion5D);
}}