1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'feature/fft-opt' into feature/feynman-rules

# Conflicts:
#	lib/FFT.h
#	lib/qcd/action/fermion/WilsonFermion5D.h
#	tests/core/Test_fft.cc
This commit is contained in:
Antonin Portelli 2016-10-26 18:44:47 +01:00
commit ca21003f01
34 changed files with 2498 additions and 2203 deletions

View File

@ -153,9 +153,10 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-5 );
Dw.Report(); Dw.Report();
} }
@ -192,7 +193,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report(); sDw.Report();
if(0){ if(0){
@ -208,8 +209,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl; std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
RealD sum=0;
RealF sum=0;
for(int x=0;x<latt4[0];x++){ for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){ for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){ for(int z=0;z<latt4[2];z++){
@ -221,18 +221,18 @@ int main (int argc, char ** argv)
peekSite(simd,sresult,site); peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd); sum=sum+norm2(normal-simd);
if (norm2(normal-simd) > 1.0e-6 ) { if (norm2(normal-simd) > 1.0e-6 ) {
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl; std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl; std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl; std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
} }
}}}}} }}}}}
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl; std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
assert (sum< 1.0e-5 );
if (1) { if (1) {
LatticeFermion sr_eo(sFGrid); LatticeFermion sr_eo(sFGrid);
LatticeFermion serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid); LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid); LatticeFermion ssrc_o (sFrbGrid);
@ -244,8 +244,6 @@ int main (int argc, char ** argv)
setCheckerboard(sr_eo,ssrc_o); setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e); setCheckerboard(sr_eo,ssrc_e);
serr = sr_eo-ssrc;
std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
sr_e = zero; sr_e = zero;
sr_o = zero; sr_o = zero;
@ -263,7 +261,7 @@ int main (int argc, char ** argv)
double flops=(1344.0*volume*ncall)/2; double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report(); sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo); sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@ -273,9 +271,18 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,sresult); pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult); pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e; ssrc_e = ssrc_e - sr_e;
RealD error = norm2(ssrc_e);
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl; std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o; ssrc_o = ssrc_o - sr_o;
error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl; std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
if(error>1.0e-5) {
setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e);
std::cout<< ssrc << std::endl;
}
} }
@ -307,7 +314,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5);
LatticeFermion src_e (FrbGrid); LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid); LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid); LatticeFermion r_e (FrbGrid);
@ -334,7 +341,7 @@ int main (int argc, char ** argv)
double flops=(1344.0*volume*ncall)/2; double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
Dw.Report(); Dw.Report();
} }
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
@ -350,11 +357,14 @@ int main (int argc, char ** argv)
err = r_eo-result; err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5);
pickCheckerboard(Even,src_e,err); pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err); pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl; std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-5);
assert(norm2(src_o)<1.0e-5);
} }

View File

@ -145,7 +145,7 @@ public:
if ( bcast != ptr ) { if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
BACKTRACEFILE(); // BACKTRACEFILE();
exit(0); exit(0);
} }
assert( bcast == (void *) ptr); assert( bcast == (void *) ptr);
@ -155,15 +155,6 @@ public:
void deallocate(pointer __p, size_type) { void deallocate(pointer __p, size_type) {
shmem_free((void *)__p); shmem_free((void *)__p);
} }
#elif defined(GRID_COMMS_MPI3)
pointer allocate(size_type __n, const void* _p= 0)
{
#error "implement MPI3 windowed allocate"
}
void deallocate(pointer __p, size_type) {
#error "implement MPI3 windowed allocate"
}
#else #else
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {

247
lib/FFT.h
View File

@ -100,44 +100,44 @@ namespace Grid {
#define FFTW_BACKWARD (+1) #define FFTW_BACKWARD (+1)
#endif #endif
class FFT { class FFT {
private: private:
GridCartesian *vgrid; GridCartesian *vgrid;
GridCartesian *sgrid; GridCartesian *sgrid;
int Nd; int Nd;
double flops; double flops;
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
std::vector<int> dimensions; std::vector<int> dimensions;
std::vector<int> processors; std::vector<int> processors;
std::vector<int> processor_coor; std::vector<int> processor_coor;
public: public:
static const int forward =FFTW_FORWARD; static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD; static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;} double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;} double MFlops(void) {return flops/usec;}
FFT ( GridCartesian * grid ) : FFT ( GridCartesian * grid ) :
vgrid(grid), vgrid(grid),
Nd(grid->_ndimension), Nd(grid->_ndimension),
dimensions(grid->_fdimensions), dimensions(grid->_fdimensions),
processors(grid->_processors), processors(grid->_processors),
processor_coor(grid->_processor_coor) processor_coor(grid->_processor_coor)
{ {
flops=0; flops=0;
usec =0; usec =0;
std::vector<int> layout(Nd,1); std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors);
}; };
~FFT ( void) { ~FFT ( void) {
delete sgrid; delete sgrid;
} }
template<class vobj> template<class vobj>
@ -164,145 +164,118 @@ namespace Grid {
template<class vobj> template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result._grid,vgrid); conformable(result._grid,vgrid);
conformable(source._grid,vgrid); conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim]; int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim]; int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1); std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions); std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node // Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors); GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils // Construct pencils
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
/* Lattice<sobj> pgbuf(&pencil_g);
std::cout << "FFT : vobj "<<demangle(typeid(vobj).name()) <<std::endl;
std::cout << "FFT : sobj "<<demangle(typeid(sobj).name()) <<std::endl;
std::cout << "FFT : scalar "<<demangle(typeid(scalar).name()) <<std::endl;
*/
Lattice<vobj> ssource(vgrid); ssource =source;
Lattice<sobj> pgsource(&pencil_g);
Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
#ifndef HAVE_FFTW
assert(0);
#else
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
{ int Ncomp = sizeof(sobj)/sizeof(scalar);
int Ncomp = sizeof(sobj)/sizeof(scalar); int Nlow = 1;
int Nlow = 1; for(int d=0;d<dim;d++){
for(int d=0;d<dim;d++){ Nlow*=vgrid->_ldimensions[d];
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
GridStopWatch timer;
// Barrel shift and collect global pencil
for(int p=0;p<processors[dim];p++) {
for(int idx=0;idx<sgrid->lSites();idx++) {
std::vector<int> lcoor(Nd);
sgrid->LocalIndexToLocalCoor(idx,lcoor);
sobj s;
peekLocalSite(s,ssource,lcoor);
lcoor[dim]+=p*L;
pokeLocalSite(s,pgsource,lcoor);
}
ssource = Cshift(ssource,dim,L);
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch Timer;
Timer.Start();
PARALLEL_FOR_LOOP
for(int idx=0;idx<NN;idx++) {
std::vector<int> lcoor(Nd);
pencil_g.LocalIndexToLocalCoor(idx,lcoor);
if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
Timer.Stop();
usec += Timer.useconds();
flops+= flops_call*NN;
int pc = processor_coor[dim];
for(int idx=0;idx<sgrid->lSites();idx++) {
std::vector<int> lcoor(Nd);
sgrid->LocalIndexToLocalCoor(idx,lcoor);
std::vector<int> gcoor = lcoor;
// extract the result
sobj s;
gcoor[dim] = lcoor[dim]+L*pc;
peekLocalSite(s,pgresult,gcoor);
s = s * div;
pokeLocalSite(s,result,lcoor);
}
FFTW<scalar>::fftw_destroy_plan(p);
} }
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
for(int p=0;p<processors[dim];p++) {
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,lcoor);
sobj s;
peekLocalSite(s,result,lcoor);
lcoor[dim]+=p*L;
pokeLocalSite(s,pgbuf,lcoor);
}
result = Cshift(result,dim,L);
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
//PARALLEL_FOR_LOOP
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx,lcoor);
if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
int pc = processor_coor[dim];
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,lcoor);
gcoor = lcoor;
sobj s;
gcoor[dim] = lcoor[dim]+L*pc;
peekLocalSite(s,pgbuf,gcoor);
s = s * div;
pokeLocalSite(s,result,lcoor);
}
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif #endif
} }
}; };
} }
#endif #endif

View File

@ -195,14 +195,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// //
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
static int Grid_is_initialised = 0;
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
{ {
GridLogger::StopWatch.Start();
CartesianCommunicator::Init(argc,argv); CartesianCommunicator::Init(argc,argv);
// Parse command line args. // Parse command line args.
GridLogger::StopWatch.Start();
std::string arg; std::string arg;
std::vector<std::string> logstreams; std::vector<std::string> logstreams;
std::string defaultLog("Error,Warning,Message,Performance"); std::string defaultLog("Error,Warning,Message,Performance");
@ -240,11 +243,14 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block); GridCmdOptionIntVector(arg,LebesgueOrder::Block);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
GridLogTimestamp(1);
}
GridParseLayout(*argv,*argc, GridParseLayout(*argv,*argc,
Grid_default_latt, Grid_default_latt,
Grid_default_mpi); Grid_default_mpi);
@ -298,12 +304,14 @@ void Grid_init(int *argc,char ***argv)
std::cout << "GNU General Public License for more details."<<std::endl; std::cout << "GNU General Public License for more details."<<std::endl;
std::cout << COL_BACKGROUND <<std::endl; std::cout << COL_BACKGROUND <<std::endl;
std::cout << std::endl; std::cout << std::endl;
Grid_is_initialised = 1;
} }
void Grid_finalize(void) void Grid_finalize(void)
{ {
#ifdef GRID_COMMS_MPI #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Finalize(); MPI_Finalize();
Grid_unquiesce_nodes(); Grid_unquiesce_nodes();
#endif #endif

View File

@ -33,6 +33,7 @@ namespace Grid {
void Grid_init(int *argc,char ***argv); void Grid_init(int *argc,char ***argv);
void Grid_finalize(void); void Grid_finalize(void);
// internal, controled with --handle // internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void); void Grid_debug_handler_init(void);
@ -44,6 +45,7 @@ namespace Grid {
const std::vector<int> &GridDefaultMpi(void); const std::vector<int> &GridDefaultMpi(void);
const int &GridThreads(void) ; const int &GridThreads(void) ;
void GridSetThreads(int t) ; void GridSetThreads(int t) ;
void GridLogTimestamp(int);
// Common parsing chores // Common parsing chores
std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);

View File

@ -49,8 +49,13 @@ namespace Grid {
} }
GridStopWatch Logger::StopWatch; GridStopWatch Logger::StopWatch;
int Logger::timestamp;
std::ostream Logger::devnull(0); std::ostream Logger::devnull(0);
void GridLogTimestamp(int on){
Logger::Timestamp(on);
}
Colours GridLogColours(0); Colours GridLogColours(0);
GridLogger GridLogError(1, "Error", GridLogColours, "RED"); GridLogger GridLogError(1, "Error", GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
@ -88,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) { void Grid_quiesce_nodes(void) {
int me = 0; int me = 0;
#ifdef GRID_COMMS_MPI #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
MPI_Comm_rank(MPI_COMM_WORLD, &me); MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM

View File

@ -37,10 +37,11 @@
#include <execinfo.h> #include <execinfo.h>
#endif #endif
namespace Grid { namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class // Dress the output; use std::chrono for time stamping via the StopWatch class
int Rank(void); // used for early stage debug before library init //////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{ class Colours{
@ -55,7 +56,6 @@ public:
void Active(bool activate){ void Active(bool activate){
is_active=activate; is_active=activate;
if (is_active){ if (is_active){
colour["BLACK"] ="\033[30m"; colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m"; colour["RED"] ="\033[31m";
@ -66,21 +66,18 @@ public:
colour["CYAN"] ="\033[36m"; colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m"; colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m"; colour["NORMAL"] ="\033[0;39m";
} else { } else {
colour["BLACK"] =""; colour["BLACK"] ="";
colour["RED"] =""; colour["RED"] ="";
colour["GREEN"] =""; colour["GREEN"] ="";
colour["YELLOW"]=""; colour["YELLOW"]="";
colour["BLUE"] =""; colour["BLUE"] ="";
colour["PURPLE"]=""; colour["PURPLE"]="";
colour["CYAN"] =""; colour["CYAN"] ="";
colour["WHITE"] =""; colour["WHITE"] ="";
colour["NORMAL"]=""; colour["NORMAL"]="";
} }
};
};
}; };
@ -88,6 +85,7 @@ class Logger {
protected: protected:
Colours &Painter; Colours &Painter;
int active; int active;
static int timestamp;
std::string name, topName; std::string name, topName;
std::string COLOUR; std::string COLOUR;
@ -99,25 +97,28 @@ public:
std::string evidence() {return Painter.colour["YELLOW"];} std::string evidence() {return Painter.colour["YELLOW"];}
std::string colour() {return Painter.colour[COLOUR];} std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
: active(on), name(nm),
name(nm), topName(topNm),
topName(topNm), Painter(col_class),
Painter(col_class), COLOUR(col) {} ;
COLOUR(col){} ;
void Active(int on) {active = on;}; void Active(int on) {active = on;};
int isActive(void) {return active;}; int isActive(void) {return active;};
static void Timestamp(int on) {timestamp = on;};
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) { if ( log.active ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.background()<< log.topName << log.background()<< " : "; stream << log.background()<< log.topName << log.background()<< " : ";
stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : "; stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
stream << log.evidence()<< now << log.background() << " : " << log.colour(); if ( log.timestamp ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.evidence()<< now << log.background() << " : " ;
}
stream << log.colour();
return stream; return stream;
} else { } else {
return devnull; return devnull;
@ -150,7 +151,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\ #define BACKTRACEFILE() {\
char string[20]; \ char string[20]; \
std::sprintf(string,"backtrace.%d",Rank()); \ std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
std::FILE * fp = std::fopen(string,"w"); \ std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\ BACKTRACEFP(fp)\
std::fclose(fp); \ std::fclose(fp); \

View File

@ -1,18 +1,22 @@
extra_sources= extra_sources=
if BUILD_COMMS_MPI if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc extra_sources+=communicator/Communicator_mpi.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_MPI3 if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_SHMEM if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_NONE if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
# #

File diff suppressed because it is too large Load Diff

View File

@ -127,6 +127,22 @@ class GridThread {
ThreadBarrier(); ThreadBarrier();
}; };
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
}; };
} }

View File

@ -77,7 +77,7 @@ public:
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0; virtual int CheckerBoard(std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;

View File

@ -49,7 +49,7 @@ public:
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
return 0; return 0;
} }
virtual int CheckerBoard(std::vector<int> site){ virtual int CheckerBoard(std::vector<int> &site){
return 0; return 0;
} }
virtual int CheckerBoardDestination(int cb,int shift,int dim){ virtual int CheckerBoardDestination(int cb,int shift,int dim){

View File

@ -49,7 +49,7 @@ public:
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(std::vector<int> site){ virtual int CheckerBoard(std::vector<int> &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){

View File

@ -0,0 +1,132 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf;
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
std::cout <<"Shm alloc "<<ptr<<std::endl;
assert(heap_bytes < MAX_MPI_SHM_BYTES);
return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank; };
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
#ifndef GRID_COMMS_MPI3
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
}
#endif
}

View File

@ -40,143 +40,188 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h> #include <mpp/shmem.h>
#endif #endif
namespace Grid { namespace Grid {
class CartesianCommunicator { class CartesianCommunicator {
public: public:
// 65536 ranks per node adequate for now
// 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16;
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
int _Nprocessors; // How many in all #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes. MPI_Comm communicator;
int _processor; // linear processor rank static MPI_Comm communicator_world;
std::vector<int> _processor_coor; // linear processor coordinate typedef MPI_Request CommsRequest_t;
unsigned long _ndimension;
#ifdef GRID_COMMS_MPI
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
#elif GRID_COMMS_MPI3
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
int GroupRank;
int ShmRank;
int WorldRank;
int GroupSize;
int ShmSize;
int WorldSize;
std::vector<int> LexicographicToWorldRank;
#else #else
typedef int CommsRequest_t; typedef int CommsRequest_t;
#endif #endif
static void Init(int *argc, char ***argv); ////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructor of any given grid
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in);
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information and queries
/////////////////////////////////
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
static int WorldRank;
static int WorldSize;
static int Slave;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);
// Constructor ////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in); // very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N);
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
// Wraps MPI_Cart routines void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void ShiftedRanks(int dim,int shift,int & source, int & dest); void *xmit,
int RankFromProcessorCoor(std::vector<int> &coor); int xmit_to_rank,
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilBarrier(void);
///////////////////////////////// ////////////////////////////////////////////////////////////
// Grid information queries // Barrier
///////////////////////////////// ////////////////////////////////////////////////////////////
int IsBoss(void) { return _processor==0; }; void Barrier(void);
int BossRank(void) { return 0; };
int ThisRank(void) { return _processor; }; ////////////////////////////////////////////////////////////
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; }; // Broadcast a buffer and composite larger
const std::vector<int> & ProcessorGrid(void) { return _processors; }; ////////////////////////////////////////////////////////////
int ProcessorCount(void) { return _Nprocessors; }; void Broadcast(int root,void* data, int bytes);
//////////////////////////////////////////////////////////// template<class obj> void Broadcast(int root,obj &data)
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data)
{ {
Broadcast(root,(void *)&data,sizeof(data)); Broadcast(root,(void *)&data,sizeof(data));
}; };
static void BroadcastWorld(int root,void* data, int bytes);
}; };
} }

View File

@ -30,21 +30,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
// Should error check all MPI calls.
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag; int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init(argc,argv); MPI_Init(argc,argv);
} }
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
} }
int Rank(void) {
int pe;
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
return pe;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_ndimension = processors.size(); _ndimension = processors.size();
@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors; _processors = processors;
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor); MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -67,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors); assert(Size==_Nprocessors);
} }
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@ -168,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
int nreq=list.size(); int nreq=list.size();
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0); assert(ierr==0);
} }
@ -187,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator); communicator);
assert(ierr==0); assert(ierr==0);
} }
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
MPI_COMM_WORLD); communicator_world);
assert(ierr==0); assert(ierr==0);
} }

View File

@ -30,25 +30,199 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
// Global used by Init and nowhere else. How to hide?
int Rank(void) { ///////////////////////////////////////////////////////////////////////////////////////////////////
int pe; // Info that is setup once and indept of cartesian layout
MPI_Comm_rank(MPI_COMM_WORLD,&pe); ///////////////////////////////////////////////////////////////////////////////////////////////////
return pe; int CartesianCommunicator::ShmSetup = 0;
MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
} }
// Should error check all MPI calls. void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag; int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init(argc,argv); MPI_Init(argc,argv);
} }
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
//
////////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and noninate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
//#include <numaif.h>
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
std::vector<int> coor = _processor_coor; std::vector<int> coor = _processor_coor;
@ -78,27 +252,11 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
int ierr;
communicator=communicator_world;
_ndimension = processors.size(); _ndimension = processors.size();
std::cout << "Creating "<< _ndimension << " dim communicator "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << processors[d]<<" ";
};
std::cout << std::endl;
WorldDims = processors;
communicator = MPI_COMM_WORLD;
MPI_Comm shmcomm;
MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&shmcomm);
MPI_Comm_rank(communicator,&WorldRank);
MPI_Comm_size(communicator,&WorldSize);
MPI_Comm_rank(shmcomm ,&ShmRank);
MPI_Comm_size(shmcomm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
std::cout<< "Ranks per node "<< ShmSize << std::endl;
std::cout<< "Nodes "<< GroupSize << std::endl;
std::cout<< "Ranks "<< WorldSize << std::endl;
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
@ -118,23 +276,20 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int dim = 0; int dim = 0;
std::vector<int> WorldDims = processors;
ShmDims.resize(_ndimension,1); ShmDims.resize(_ndimension,1);
GroupDims.resize(_ndimension); GroupDims.resize(_ndimension);
ShmCoor.resize(_ndimension); ShmCoor.resize(_ndimension);
GroupCoor.resize(_ndimension); GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension); WorldCoor.resize(_ndimension);
for(int l2=0;l2<log2size;l2++){ for(int l2=0;l2<log2size;l2++){
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension; while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2; ShmDims[dim]*=2;
dim=(dim+1)%_ndimension; dim=(dim+1)%_ndimension;
} }
std::cout << "Shm group dims "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << ShmDims[d]<<" ";
};
std::cout << std::endl;
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
@ -142,22 +297,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
GroupDims[d] = WorldDims[d]/ShmDims[d]; GroupDims[d] = WorldDims[d]/ShmDims[d];
} }
std::cout << "Group dims "<<std::endl;
for(int d =0;d<_ndimension;d++){
std::cout << GroupDims[d]<<" ";
};
std::cout << std::endl;
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator, &WorldGroup);
MPI_Comm_group (shmcomm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
std::vector<int> group_ranks(WorldSize);
std::vector<int> mygroup(GroupSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &group_ranks[0]);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Check processor counts match // Check processor counts match
@ -166,55 +305,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors; _processors = processors;
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
for(int i=0;i<_ndimension;i++){ for(int i=0;i<_ndimension;i++){
std::cout << " p " << _processors[i]<<std::endl;
_Nprocessors*=_processors[i]; _Nprocessors*=_processors[i];
} }
std::cout << " World " <<WorldSize <<" Nproc "<<_Nprocessors<<std::endl;
assert(WorldSize==_Nprocessors); assert(WorldSize==_Nprocessors);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and noninate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(group_ranks[rank]!=MPI_UNDEFINED){
mygroup[g] = rank;
}
}
std::sort(mygroup.begin(),mygroup.end(),std::greater<int>());
int myleader = mygroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@ -307,6 +400,80 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from, int from,
int bytes) int bytes)
{ {
#if 0
this->StencilBarrier();
MPI_Request xrq;
MPI_Request rrq;
static int sequence;
int ierr;
int tag;
int check;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
sequence++;
char *from_ptr = (char *)ShmCommBufs[ShmRank];
int small = (bytes<MAX_MPI_SHM_BYTES);
typedef uint64_t T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);
if ( small && (gdest !=MPI_UNDEFINED) ) {
char *to_ptr = (char *)ShmCommBufs[gdest];
assert(gme != gdest);
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
this->StencilBarrier();
if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
assert(tag==from);
} else {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
this->StencilBarrier();
#else
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
int rank = _processor; int rank = _processor;
@ -318,13 +485,62 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
list.push_back(xrq); list.push_back(xrq);
list.push_back(rrq); list.push_back(rrq);
#endif
} }
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
assert(gme == ShmRank);
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
SendToRecvFromComplete(list);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0); assert(ierr==0);
} }
@ -350,7 +566,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
MPI_COMM_WORLD); communicator_world);
assert(ierr==0); assert(ierr==0);
} }

View File

@ -28,12 +28,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h" #include "Grid.h"
namespace Grid { namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
} }
int Rank(void ){ return 0; };
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_processors = processors; _processors = processors;
@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0); assert(0);
} }
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void){}
{ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ assert(0);}
{
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;
dest=0; dest=0;
} }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}
} }

View File

@ -39,17 +39,22 @@ namespace Grid {
BACKTRACEFILE(); \ BACKTRACEFILE(); \
}\ }\
} }
int Rank(void) {
return shmem_my_pe();
} ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
typedef struct HandShake_t { typedef struct HandShake_t {
uint64_t seq_local; uint64_t seq_local;
uint64_t seq_remote; uint64_t seq_remote;
} HandShake; } HandShake;
static Vector< HandShake > XConnections; static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections; static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init(); shmem_init();
XConnections.resize(shmem_n_pes()); XConnections.resize(shmem_n_pes());
@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0; RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0; RConnections[pe].seq_remote= 0;
} }
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all(); shmem_barrier_all();
ShmInitGeneric();
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_ndimension = processors.size(); _ndimension = processors.size();
@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
if ( _processor == sender ) { if ( _processor == sender ) {
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
// Check he has posted a receive // Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local); while(SendSeq->seq_remote == SendSeq->seq_local);
printf("Sender receive %d posted\n",sender,receiver);
// Advance our send count // Advance our send count
seq = ++(SendSeq->seq_local); seq = ++(SendSeq->seq_local);
@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
shmem_putmem(recv,xmit,bytes,receiver); shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence(); shmem_fence();
printf("Sender sent payload %d\n",seq);
//Notify him we're done //Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence(); shmem_fence();
printf("Sender ringing door bell %d\n",seq);
} }
if ( _processor == receiver ) { if ( _processor == receiver ) {
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
// Post a receive // Post a receive
seq = ++(RecvSeq->seq_local); seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
printf("Receiver Opening letter box %d\n",seq);
// Now wait until he has advanced our reception counter // Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local); while(RecvSeq->seq_remote != RecvSeq->seq_local);
printf("Receiver Got the mail %d\n",seq);
} }
} }

View File

@ -154,7 +154,7 @@ PARALLEL_FOR_LOOP
template<class vobj,class sobj> template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){ void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid; GridBase *grid = l._grid;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@ -164,16 +164,18 @@ PARALLEL_FOR_LOOP
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
std::vector<sobj> buf(Nsimd); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
extract(l._odata[odx],buf);
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
}
s = buf[idx];
return; return;
}; };
@ -190,18 +192,17 @@ PARALLEL_FOR_LOOP
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
std::vector<sobj> buf(Nsimd); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
// extract-modify-merge cycle is easiest way and this is not perf critical
extract(l._odata[odx],buf);
buf[idx] = s; for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
merge(l._odata[odx],buf); }
return; return;
}; };

View File

@ -294,11 +294,12 @@ namespace Grid {
int rank,o_idx,i_idx; int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
int l_idx=generator_idx(o_idx,i_idx); int l_idx=generator_idx(o_idx,i_idx);
std::vector<int> site_seeds(4); const int num_rand_seed=16;
for(int i=0;i<4;i++){ std::vector<int> site_seeds(num_rand_seed);
for(int i=0;i<site_seeds.size();i++){
site_seeds[i]= ui(pseeder); site_seeds[i]= ui(pseeder);
} }

View File

@ -33,511 +33,500 @@ directory
#define GRID_QCD_FERMION_OPERATOR_IMPL_H #define GRID_QCD_FERMION_OPERATOR_IMPL_H
namespace Grid { namespace Grid {
namespace QCD {
namespace QCD {
////////////////////////////////////////////// //////////////////////////////////////////////
// Template parameter class constructs to package // Template parameter class constructs to package
// externally control Fermion implementations // externally control Fermion implementations
// in orthogonal directions // in orthogonal directions
// //
// Ultimately need Impl to always define types where XXX is opaque // Ultimately need Impl to always define types where XXX is opaque
// //
// typedef typename XXX Simd; // typedef typename XXX Simd;
// typedef typename XXX GaugeLinkField; // typedef typename XXX GaugeLinkField;
// typedef typename XXX GaugeField; // typedef typename XXX GaugeField;
// typedef typename XXX GaugeActField; // typedef typename XXX GaugeActField;
// typedef typename XXX FermionField; // typedef typename XXX FermionField;
// typedef typename XXX DoubledGaugeField; // typedef typename XXX DoubledGaugeField;
// typedef typename XXX SiteSpinor; // typedef typename XXX SiteSpinor;
// typedef typename XXX SiteHalfSpinor; // typedef typename XXX SiteHalfSpinor;
// typedef typename XXX Compressor; // typedef typename XXX Compressor;
// //
// and Methods: // and Methods:
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// //
// //
// To acquire the typedefs from "Base" (either a base class or template param) use: // To acquire the typedefs from "Base" (either a base class or template param) use:
// //
// INHERIT_GIMPL_TYPES(Base) // INHERIT_GIMPL_TYPES(Base)
// INHERIT_FIMPL_TYPES(Base) // INHERIT_FIMPL_TYPES(Base)
// INHERIT_IMPL_TYPES(Base) // INHERIT_IMPL_TYPES(Base)
// //
// The Fermion operators will do the following: // The Fermion operators will do the following:
// //
// struct MyOpParams { // struct MyOpParams {
// RealD mass; // RealD mass;
// }; // };
// //
// //
// template<class Impl> // template<class Impl>
// class MyOp : public<Impl> { // class MyOp : public<Impl> {
// public: // public:
// //
// INHERIT_ALL_IMPL_TYPES(Impl); // INHERIT_ALL_IMPL_TYPES(Impl);
// //
// MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam)
// { // {
// //
// }; // };
// //
// } // }
////////////////////////////////////////////// //////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Implementation dependent fermion types
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Implementation dependent fermion types
////////////////////////////////////////////////////////////////////////
#define INHERIT_FIMPL_TYPES(Impl)\ #define INHERIT_FIMPL_TYPES(Impl)\
typedef typename Impl::FermionField FermionField; \ typedef typename Impl::FermionField FermionField; \
typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ typedef typename Impl::DoubledGaugeField DoubledGaugeField; \
typedef typename Impl::SiteSpinor SiteSpinor; \ typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \ typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::Coeff_t Coeff_t; typedef typename Impl::Coeff_t Coeff_t;
#define INHERIT_IMPL_TYPES(Base) \ #define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \
INHERIT_FIMPL_TYPES(Base) INHERIT_FIMPL_TYPES(Base)
/////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
/////// const bool LsVectorised=false;
// Single flavour four spinors with colour index typedef _Coeff_t Coeff_t;
///////
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
class WilsonImpl
: public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
const bool LsVectorised=false; INHERIT_GIMPL_TYPES(Gimpl);
typedef _Coeff_t Coeff_t;
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >; template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >; template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>; template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
ImplParams Params;
WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
typedef iImplSpinor<Simd> SiteSpinor; bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField; inline void multLink(SiteHalfSpinor &phi,
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi,
int mu,
StencilEntry *SE,
StencilImpl &St) {
mult(&phi(), &U(mu), &chi());
}
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor; template <class ref>
typedef WilsonImplParams ImplParams; inline void loadLinkElement(Simd &reg, ref &memory) {
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl; reg = memory;
}
ImplParams Params; inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; const GaugeField &Umu) {
conformable(Uds._grid, GaugeGrid);
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; conformable(Umu._grid, GaugeGrid);
GaugeLinkField U(GaugeGrid);
inline void multLink(SiteHalfSpinor &phi, for (int mu = 0; mu < Nd; mu++) {
const SiteDoubledGaugeField &U, U = PeekIndex<LorentzIndex>(Umu, mu);
const SiteHalfSpinor &chi, PokeIndex<LorentzIndex>(Uds, U, mu);
int mu, U = adj(Cshift(U, mu, -1));
StencilEntry *SE, PokeIndex<LorentzIndex>(Uds, U, mu + 4);
StencilImpl &St) {
mult(&phi(), &U(mu), &chi());
} }
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
template <class ref> inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
inline void loadLinkElement(Simd &reg,
ref &memory) {
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid, int Ls=Btilde._grid->_fdimensions[0];
DoubledGaugeField &Uds, GaugeLinkField tmp(mat._grid);
const GaugeField &Umu) { tmp = zero;
conformable(Uds._grid, GaugeGrid);
conformable(Umu._grid, GaugeGrid); PARALLEL_FOR_LOOP
GaugeLinkField U(GaugeGrid); for(int sss=0;sss<tmp._grid->oSites();sss++){
for (int mu = 0; mu < Nd; mu++) { int sU=sss;
U = PeekIndex<LorentzIndex>(Umu, mu); for(int s=0;s<Ls;s++){
PokeIndex<LorentzIndex>(Uds, U, mu); int sF = s+Ls*sU;
U = adj(Cshift(U, mu, -1)); tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
} }
} }
PokeIndex<LorentzIndex>(mat,tmp,mu);
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ }
};
int Ls=Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP ////////////////////////////////////////////////////////////////////////////////////
for(int sss=0;sss<tmp._grid->oSites();sss++){ // Single flavour four spinors with colour index, 5d redblack
int sU=sss; ////////////////////////////////////////////////////////////////////////////////////
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
}
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:
static const int Dimension = Nrepresentation;
const bool LsVectorised=true;
typedef _Coeff_t Coeff_t;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;
// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
ImplParams Params;
DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return false; };
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory);
}
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
} }
}; }
mult(&phi(), &UU(), &chi());
/////// }
// Single flavour four spinors with colour index, 5d redblack
///////
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:
static const int Dimension = Nrepresentation; inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
const bool LsVectorised=true; {
typedef _Coeff_t Coeff_t; SiteScalarGaugeField ScalarUmu;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl; SiteDoubledGaugeField ScalarUds;
INHERIT_GIMPL_TYPES(Gimpl); GaugeLinkField U(Umu._grid);
GaugeField Uadj(Umu._grid);
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uadj, U, mu);
}
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >; peekLocalSite(ScalarUmu, Umu, lcoor);
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >; for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
typedef iImplSpinor<Simd> SiteSpinor; peekLocalSite(ScalarUmu, Uadj, lcoor);
typedef iImplHalfSpinor<Simd> SiteHalfSpinor; for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
typedef Lattice<SiteSpinor> FermionField;
// Make the doubled gauge field a *scalar* pokeLocalSite(ScalarUds, Uds, lcoor);
typedef iImplDoubledGaugeField<typename Simd::scalar_type> }
SiteDoubledGaugeField; // This is a scalar }
typedef iImplGaugeField<typename Simd::scalar_type>
SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type>
SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
{
assert(0);
}
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor; inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu)
typedef WilsonImplParams ImplParams; {
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
ImplParams Params;
DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return false; };
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory);
}
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
const GaugeField &Umu) {
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;
GaugeLinkField U(Umu._grid);
GaugeField Uadj(Umu._grid);
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uadj, U, mu);
}
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
FermionField &A, int mu) {
assert(0); assert(0);
} }
};
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
FermionField &Atilde, int mu) {
assert(0);
}
};
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*? // Flavour doubled spinors; is Gparity the only? what about C*?
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
template <class S, int Nrepresentation,class _Coeff_t = RealD> template <class S, int Nrepresentation,class _Coeff_t = RealD>
class GparityWilsonImpl class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
: public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { public:
public:
static const int Dimension = Nrepresentation;
const bool LsVectorised=false; static const int Dimension = Nrepresentation;
typedef _Coeff_t Coeff_t; const bool LsVectorised=false;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
typedef _Coeff_t Coeff_t;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
INHERIT_GIMPL_TYPES(Gimpl); template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
template <typename vtype> typedef iImplSpinor<Simd> SiteSpinor;
using iImplSpinor = typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>; typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
template <typename vtype>
using iImplHalfSpinor = typedef Lattice<SiteSpinor> FermionField;
iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>; typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
template <typename vtype>
using iImplDoubledGaugeField = typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>; typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
typedef iImplSpinor<Simd> SiteSpinor; ImplParams Params;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
typedef GparityWilsonImplParams ImplParams; GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
ImplParams Params;
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; // provide the multiply by link that is differentiated between Gparity (with
// flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;
// provide the multiply by link that is differentiated between Gparity (with
// flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;
vobj vtmp; vobj vtmp;
sobj stmp; sobj stmp;
GridBase *grid = St._grid; GridBase *grid = St._grid;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
int direction = St._directions[mu]; int direction = St._directions[mu];
int distance = St._distances[mu]; int distance = St._distances[mu];
int ptype = St._permute_type[mu]; int ptype = St._permute_type[mu];
int sl = St._grid->_simd_layout[direction]; int sl = St._grid->_simd_layout[direction];
// Fixme X.Y.Z.T hardcode in stencil
int mmu = mu % Nd;
// Fixme X.Y.Z.T hardcode in stencil // assert our assumptions
int mmu = mu % Nd; assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2));
std::vector<int> icoor;
// assert our assumptions if ( SE->_around_the_world && Params.twists[mmu] ) {
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2));
std::vector<int> icoor;
if ( SE->_around_the_world && Params.twists[mmu] ) {
if ( sl == 2 ) { if ( sl == 2 ) {
std::vector<sobj> vals(Nsimd);
std::vector<sobj> vals(Nsimd); extract(chi,vals);
for(int s=0;s<Nsimd;s++){
extract(chi,vals); grid->iCoorFromIindex(icoor,s);
for(int s=0;s<Nsimd;s++){
grid->iCoorFromIindex(icoor,s);
assert((icoor[direction]==0)||(icoor[direction]==1)); assert((icoor[direction]==0)||(icoor[direction]==1));
int permute_lane; int permute_lane;
if ( distance == 1) { if ( distance == 1) {
permute_lane = icoor[direction]?1:0; permute_lane = icoor[direction]?1:0;
} else { } else {
permute_lane = icoor[direction]?0:1; permute_lane = icoor[direction]?0:1;
}
if ( permute_lane ) {
stmp(0) = vals[s](1);
stmp(1) = vals[s](0);
vals[s] = stmp;
} }
}
if ( permute_lane ) { merge(vtmp,vals);
stmp(0) = vals[s](1);
stmp(1) = vals[s](0); } else {
vals[s] = stmp; vtmp(0) = chi(1);
} vtmp(1) = chi(0);
} }
merge(vtmp,vals); mult(&phi(0),&U(0)(mu),&vtmp(0));
mult(&phi(1),&U(1)(mu),&vtmp(1));
} else {
mult(&phi(0),&U(0)(mu),&chi(0));
mult(&phi(1),&U(1)(mu),&chi(1));
}
}
} else { inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
vtmp(0) = chi(1); {
vtmp(1) = chi(0); conformable(Uds._grid,GaugeGrid);
} conformable(Umu._grid,GaugeGrid);
mult(&phi(0),&U(0)(mu),&vtmp(0));
mult(&phi(1),&U(1)(mu),&vtmp(1)); GaugeLinkField Utmp (GaugeGrid);
GaugeLinkField U (GaugeGrid);
} else { GaugeLinkField Uconj(GaugeGrid);
mult(&phi(0),&U(0)(mu),&chi(0));
mult(&phi(1),&U(1)(mu),&chi(1)); Lattice<iScalar<vInteger> > coor(GaugeGrid);
}
} for(int mu=0;mu<Nd;mu++){
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid);
GaugeLinkField Utmp (GaugeGrid);
GaugeLinkField U (GaugeGrid);
GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu); LatticeCoordinate(coor,mu);
U = PeekIndex<LorentzIndex>(Umu,mu); U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U); Uconj = conjugate(U);
// This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) {
Uconj = where(coor==neglink,-Uconj,Uconj);
}
// This phase could come from a simple bc 1,1,-1,1 .. PARALLEL_FOR_LOOP
int neglink = GaugeGrid->GlobalDimensions()[mu]-1; for(auto ss=U.begin();ss<U.end();ss++){
if ( Params.twists[mu] ) { Uds[ss](0)(mu) = U[ss]();
Uconj = where(coor==neglink,-Uconj,Uconj); Uds[ss](1)(mu) = Uconj[ss]();
} }
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1));
Utmp = U;
if ( Params.twists[mu] ) {
Utmp = where(coor==0,Uconj,Utmp);
}
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){ for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss](); Uds[ss](0)(mu+4) = Utmp[ss]();
Uds[ss](1)(mu) = Uconj[ss](); }
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary Utmp = Uconj;
Uconj = adj(Cshift(Uconj,mu,-1)); if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp);
}
Utmp = U; PARALLEL_FOR_LOOP
if ( Params.twists[mu] ) { for(auto ss=U.begin();ss<U.end();ss++){
Utmp = where(coor==0,Uconj,Utmp); Uds[ss](1)(mu+4) = Utmp[ss]();
} }
PARALLEL_FOR_LOOP }
for(auto ss=U.begin();ss<U.end();ss++){ }
Uds[ss](0)(mu+4) = Utmp[ss]();
}
Utmp = Uconj;
if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
FermionField &A, int mu) {
// DhopDir provides U or Uconj depending on coor/flavour. // DhopDir provides U or Uconj depending on coor/flavour.
GaugeLinkField link(mat._grid); GaugeLinkField link(mat._grid);
// use lorentz for flavour as hack. // use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
} }
PokeIndex<LorentzIndex>(mat, link, mu); PokeIndex<LorentzIndex>(mat, link, mu);
return; return;
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0]; int Ls = Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid); GaugeLinkField tmp(mat._grid);
tmp = zero; tmp = zero;
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < tmp._grid->oSites(); ss++) { for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss; int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])); auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
} }
} }
PokeIndex<LorentzIndex>(mat, tmp, mu); PokeIndex<LorentzIndex>(mat, tmp, mu);
return; return;
} }
};
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec };
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD; // Double
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation > WilsonAdjImplR; // Real.. whichever prec }}
typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD; // Double
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
typedef GparityWilsonImpl<vComplex, Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
}
}
#endif #endif

View File

@ -222,7 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
//////////////////////// ////////////////////////
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) { for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma); gamma);
} }
@ -333,7 +333,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
dirdisp, gamma); dirdisp, gamma);
} }
}; };
@ -351,13 +351,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
if (dag == DaggerYes) { if (dag == DaggerYes) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
out); out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
out); out);
} }
} }

View File

@ -184,44 +184,37 @@ void WilsonFermion5D<Impl>::Report(void)
if ( DhopCalls > 0 ) { if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
<< " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl;
<< DhopCommTime / DhopCalls << " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : "
<< DhopComputeTime << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
} }
if ( DerivCalls > 0 ) { if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
} }
if (DerivCalls > 0 || DhopCalls > 0){ if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report(); std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
} }
} }
@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma); Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
} }
} }
}; };
@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
assert(sF < B._grid->oSites()); assert(sF < B._grid->oSites());
assert(sU < U._grid->oSites()); assert(sU < U._grid->oSites());
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
gamma);
//////////////////////////// ////////////////////////////
// spin trace outer product // spin trace outer product
@ -342,10 +334,10 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat, void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag) int dag)
{ {
conformable(A._grid,FermionGrid()); conformable(A._grid,FermionGrid());
conformable(A._grid,B._grid); conformable(A._grid,B._grid);
@ -358,9 +350,9 @@ void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat, void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag) int dag)
{ {
conformable(A._grid,FermionRedBlackGrid()); conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid); conformable(GaugeRedBlackGrid(),mat._grid);
@ -376,9 +368,9 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat, void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag) int dag)
{ {
conformable(A._grid,FermionRedBlackGrid()); conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid); conformable(GaugeRedBlackGrid(),mat._grid);
@ -393,10 +385,9 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -413,27 +404,25 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss; int sU = ss;
int sF = LLs * sU; int sF = LLs * sU;
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
out);
} }
#ifdef AVX512 #ifdef AVX512
} else if (stat.is_init() ) { } else if (stat.is_init() ) {
int nthreads; int nthreads;
stat.start(); stat.start();
#pragma omp parallel #pragma omp parallel
{ {
#pragma omp master #pragma omp master
nthreads = omp_get_num_threads(); nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num(); int mythread = omp_get_thread_num();
stat.enter(mythread); stat.enter(mythread);
#pragma omp for nowait #pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++) for(int ss=0;ss<U._grid->oSites();ss++) {
{ int sU=ss;
int sU=ss; int sF=LLs*sU;
int sF=LLs*sU; Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); }
}
stat.exit(mythread); stat.exit(mythread);
} }
stat.accum(nthreads); stat.accum(nthreads);
@ -443,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss; int sU = ss;
int sF = LLs * sU; int sF = LLs * sU;
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
out);
} }
} }
DhopComputeTime+=usecond(); DhopComputeTime+=usecond();
@ -454,6 +442,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -465,6 +454,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -476,6 +466,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid); conformable(in._grid,out._grid);

View File

@ -34,8 +34,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Stat.h> #include <Grid/Stat.h>
namespace Grid { namespace Grid {
namespace QCD {
namespace QCD { ////////////////////////////////////////////////////////////////////////////////
// This is the 4d red black case appropriate to support
//
// parity = (x+y+z+t)|2;
// generalised five dim fermions like mobius, zolotarev etc..
//
// i.e. even even contains fifth dim hopping term.
//
// [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// This is the 4d red black case appropriate to support // This is the 4d red black case appropriate to support
@ -114,78 +124,78 @@ namespace Grid {
// add a DhopComm // add a DhopComm
// -- suboptimal interface will presently trigger multiple comms. // -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// New methods added // New methods added
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st, void DerivInternal(StencilImpl & st,
DoubledGaugeField & U, DoubledGaugeField & U,
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag); int dag);
void DhopInternal(StencilImpl & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
int dag); int dag);
// Constructors // Constructors
WilsonFermion5D(GaugeField &_Umu, WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
// Constructors // Constructors
/* /*
WilsonFermion5D(int simd, WilsonFermion5D(int simd,
GaugeField &_Umu, GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
*/ */
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
// Add these to the support from Wilson
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
GridBase *_FiveDimGrid;
GridBase *_FiveDimRedBlackGrid;
double M5;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};
// DoubleStore }}
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
// Add these to the support from Wilson
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
GridBase *_FiveDimGrid;
GridBase *_FiveDimRedBlackGrid;
double M5;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};
}
}
#endif #endif

View File

@ -43,10 +43,9 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
//////////////////////////////////////////// ////////////////////////////////////////////
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag( void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor *buf, int sF,
commVector<SiteHalfSpinor> &buf, int sF, int sU, const FermionField &in, FermionField &out) {
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -220,10 +219,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
// Need controls to do interior, exterior, or both // Need controls to do interior, exterior, or both
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite( void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor *buf, int sF,
commVector<SiteHalfSpinor> &buf, int sF, int sU, const FermionField &in, FermionField &out) {
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
}; };
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir( void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
StencilImpl &st, DoubledGaugeField &U, int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
commVector<SiteHalfSpinor> &buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteSpinor result; SiteSpinor result;

View File

@ -32,175 +32,132 @@ directory
#define GRID_QCD_DHOP_H #define GRID_QCD_DHOP_H
namespace Grid { namespace Grid {
namespace QCD {
namespace QCD { ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Common to both the WilsonFermion and WilsonFermion5D
// Helper routines that implement Wilson stencil for a single site. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common to both the WilsonFermion and WilsonFermion5D class WilsonKernelsStatic {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// public:
class WilsonKernelsStatic { // S-direction is INNERMOST and takes no part in the parity.
public: static int AsmOpt; // these are a temporary hack
// S-direction is INNERMOST and takes no part in the parity. static int HandOpt; // these are a temporary hack
static int AsmOpt; // these are a temporary hack };
static int HandOpt; // these are a temporary hack
}; template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public: INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base; public:
public: template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
template <bool EnableBool = true> DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
#ifdef AVX512 #ifdef AVX512
if (AsmOpt) { if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
in, out); } else {
} else {
#else #else
{ {
#endif #endif
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
if (HandOpt) if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
in, out); else
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, sF++;
in, out);
sF++;
}
sU++;
}
}
} }
sU++;
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
out);
sF++;
}
sU++;
}
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
Ns, in, out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
sU, in, out);
sF++;
}
sU++;
}
}
}
template <bool EnableBool = true>
typename std::enable_if<
(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
sF++;
}
sU++;
}
}
void DiracOptDhopDir(
StencilImpl &st, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptAsmDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptAsmDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptHandDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p = ImplParams());
};
} }
} }
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
sF++;
}
sU++;
}
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p = ImplParams());
};
}}
#endif #endif

View File

@ -33,31 +33,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// Default to no assembler implementation // Default to no assembler implementation
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
template<class Impl> template<class Impl> void
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) {
{ assert(0);
assert(0); }
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
#if defined(AVX512) #if defined(AVX512)
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine // If we are AVX512 specialise the single precision routine
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
@ -65,16 +61,16 @@ namespace Grid {
#include <simd/Intel512wilson.h> #include <simd/Intel512wilson.h>
#include <simd/Intel512single.h> #include <simd/Intel512single.h>
static Vector<vComplexF> signs; static Vector<vComplexF> signs;
int setupSigns(void ){ int setupSigns(void ){
Vector<vComplexF> bother(2); Vector<vComplexF> bother(2);
signs = bother; signs = bother;
vrsign(signs[0]); vrsign(signs[0]);
visign(signs[1]); visign(signs[1]);
return 1; return 1;
} }
static int signInit = setupSigns(); static int signInit = setupSigns();
#define label(A) ilabel(A) #define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n" #define ilabel(A) ".globl\n" #A ":\n"
@ -84,17 +80,15 @@ namespace Grid {
#define FX(A) WILSONASM_ ##A #define FX(A) WILSONASM_ ##A
#undef KERNEL_DAG #undef KERNEL_DAG
template<> template<> void
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG #define KERNEL_DAG
template<> template<> void
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP #undef VMOVIDUP
@ -109,31 +103,26 @@ namespace Grid {
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG #undef KERNEL_DAG
template<> template<> void
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG #define KERNEL_DAG
template<> template<> void
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif #endif
#define INSTANTIATE_ASM(A)\ #define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
commVector<SiteHalfSpinor> &buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ \
commVector<SiteHalfSpinor> &buf,\ template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF); INSTANTIATE_ASM(ZWilsonImplF);
@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplD);
}
} }}

View File

@ -311,10 +311,9 @@ namespace Grid {
namespace QCD { namespace QCD {
template<class Impl> template<class Impl> void
void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int sU,const FermionField &in, FermionField &out)
int ss,int sU,const FermionField &in, FermionField &out)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -554,10 +553,9 @@ namespace QCD {
} }
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int ss,int sU,const FermionField &in, FermionField &out)
int ss,int sU,const FermionField &in, FermionField &out)
{ {
// std::cout << "Hand op Dhop "<<std::endl; // std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -798,38 +796,35 @@ namespace QCD {
} }
} }
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Specialise Gparity to simple implementation // Specialise Gparity to simple implementation
//////////////////////////////////////////////// ////////////////////////////////////////////////
template<> template<> void
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf, SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
template<> template<> void
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf, SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
template<> template<> void
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int sF,int sU,const FermionField &in, FermionField &out)
int sF,int sU,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
template<> template<> void
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
commVector<SiteHalfSpinor> &buf, int sF,int sU,const FermionField &in, FermionField &out)
int sF,int sU,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
@ -840,12 +835,10 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
// Need Nc=3 though // // Need Nc=3 though //
#define INSTANTIATE_THEM(A) \ #define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
commVector<SiteHalfSpinor> &buf,\ int ss,int sU,const FermionField &in, FermionField &out); \
int ss,int sU,const FermionField &in, FermionField &out);\ template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ int ss,int sU,const FermionField &in, FermionField &out);
commVector<SiteHalfSpinor> &buf,\
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD); INSTANTIATE_THEM(WilsonImplD);

View File

@ -42,20 +42,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid{ namespace Grid{
namespace Optimization { namespace Optimization {
template<class vtype>
union uconv {
__m512 f;
vtype v;
};
union u512f { union u512f {
__m512 v; __m512 v;
float f[8]; float f[16];
}; };
union u512d { union u512d {
__m512 v; __m512d v;
double f[4]; double f[8];
}; };
struct Vsplat{ struct Vsplat{
@ -388,9 +382,14 @@ namespace Optimization {
// Some Template specialization // Some Template specialization
// Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
<<<<<<< HEAD
#define GNU_CLANG_COMPILER
#ifdef GNU_CLANG_COMPILER
=======
#ifndef __INTEL_COMPILER #ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics" #warning "Slow reduction due to incomplete reduce intrinsics"
>>>>>>> develop
//Complex float Reduce //Complex float Reduce
template<> template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){

View File

@ -53,7 +53,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \ VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \ VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \ VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir) VMULd(tmp,C,Criir)

View File

@ -37,7 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// Opcodes common // Opcodes common
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \ #define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\ __asm__ ("mov $0xAAAA, %%eax \n"\
"kmovw %%eax, %%k6 \n"\ "kmovw %%eax, %%k6 \n"\
"mov $0x5555, %%eax \n"\ "mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax"); "kmovw %%eax, %%k7 \n" : : : "%eax");

View File

@ -116,7 +116,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
Check._odata[i] = Foo._odata[SE->_offset]; Check._odata[i] = Foo._odata[SE->_offset];
else else
Check._odata[i] = myStencil.comm_buf[SE->_offset]; Check._odata[i] = myStencil.CommBuf()[SE->_offset];
} }
Real nrmC = norm2(Check); Real nrmC = norm2(Check);
@ -207,7 +207,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
OCheck._odata[i] = EFoo._odata[SE->_offset]; OCheck._odata[i] = EFoo._odata[SE->_offset];
else else
OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
} }
for(int i=0;i<ECheck._grid->oSites();i++){ for(int i=0;i<ECheck._grid->oSites();i++){
int permute_type; int permute_type;
@ -220,7 +220,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
ECheck._odata[i] = OFoo._odata[SE->_offset]; ECheck._odata[i] = OFoo._odata[SE->_offset];
else else
ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
} }
setCheckerboard(Check,ECheck); setCheckerboard(Check,ECheck);

View File

@ -86,11 +86,12 @@ int main (int argc, char ** argv)
FFT theFFT(&GRID); FFT theFFT(&GRID);
Ctilde=C;
std::cout<<" Benchmarking FFT of LatticeComplex "<<std::endl; std::cout<<" Benchmarking FFT of LatticeComplex "<<std::endl;
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<" Mflops "<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" Mflops "<<std::endl;
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<" Mflops "<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<" Mflops "<<std::endl;
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<" Mflops "<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<" Mflops "<<std::endl;
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<" Mflops "<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" Mflops "<<std::endl;
// C=zero; // C=zero;
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde); // Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@ -113,11 +114,12 @@ int main (int argc, char ** argv)
Cref= Cref - C; Cref= Cref - C;
std::cout << " invertible check " << norm2(Cref)<<std::endl; std::cout << " invertible check " << norm2(Cref)<<std::endl;
Stilde=S;
std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl; std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<" mflops "<<std::endl; theFFT.FFT_dim(Stilde,S,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<" mflops "<<std::endl; theFFT.FFT_dim(Stilde,S,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<" mflops "<<std::endl; theFFT.FFT_dim(Stilde,S,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" mflops "<<std::endl; theFFT.FFT_dim(Stilde,S,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
SpinMatrixD Sp; SpinMatrixD Sp;
Sp = zero; Sp = Sp+cVol; Sp = zero; Sp = Sp+cVol;