1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required

This commit is contained in:
Peter Boyle 2015-04-27 13:45:07 +01:00
parent 94f728bee4
commit f159495a9d
11 changed files with 176 additions and 204 deletions

4
TODO
View File

@ -2,6 +2,10 @@
- use protocol buffers? replace xmlReader/Writer ec..
- Binary use htonll, htonl
* Stencil -- do the permute for locally permuted in halo exchange.
- BUG cshift mpi; the "s" indexing is weird in the Cshift_comms_simd
as simd_layout or not is confusing
* Reduce implemention is poor
* Bug in SeedFixedIntegers gives same output on each site.
* Bug in RNG with complex numbers ; only filling real values; need helper function -- DONE

View File

@ -2,7 +2,7 @@
/* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
/* #undef AVX1 */
#define AVX1 1
/* AVX2 */
/* #undef AVX2 */
@ -77,7 +77,7 @@
#define PACKAGE_VERSION "1.0"
/* SSE4 */
#define SSE4 1
/* #undef SSE4 */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

View File

@ -52,7 +52,7 @@ namespace Grid {
// Gather for when there is no need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void
Gather_plane_simple (Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -97,7 +97,7 @@ Gather_plane_simple (Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj>
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> void
Gather_plane_extract(Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -182,7 +182,7 @@ template<class cobj,class vobj,class compressor> void
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj,class cobj, class compressor> void
HaloExchange(Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
@ -237,7 +237,7 @@ template<class cobj,class vobj,class compressor> void
}
template<class vobj,class cobj, class compressor>
void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
{
@ -250,6 +250,7 @@ template<class cobj,class vobj,class compressor> void
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
@ -267,11 +268,10 @@ template<class cobj,class vobj,class compressor> void
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (offnode) {
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc) {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
@ -284,7 +284,8 @@ template<class cobj,class vobj,class compressor> void
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@ -293,7 +294,11 @@ template<class cobj,class vobj,class compressor> void
bytes);
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
printf("GatherStartComms inserting %le to u_comm_offset %d buf size %d for dim %d shift %d\n",
*( (RealF *) &recv_buf[0]),
u_comm_offset,buffer_size,
dimension,shift
); fflush(stdout);
for(int i=0;i<buffer_size;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
}
@ -305,17 +310,19 @@ template<class cobj,class vobj,class compressor> void
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{
const int Nsimd = _grid->Nsimd();
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
@ -346,89 +353,62 @@ template<class cobj,class vobj,class compressor> void
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(_grid->Nd());
for(int x=0;x<rd;x++){
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
int o = 0;
int bo = x*_grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[Nsimd-1-i] = (scalar_type *)&send_buf_extract[i][0];
std::cout<< "Gathering "<< x <<std::endl;
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
std::cout<< "Gathered "<<std::endl;
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
std::cout<< "MPI sending "<<std::endl;
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
std::cout<< "MPI complete "<<std::endl;
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
}
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
for(int i=0;i<Nsimd;i++){
int s;
_grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[Nsimd-1-i] = rpointers[PermuteMap];
} else {
pointers[Nsimd-1-i] = rpointers[i];
}
}
// Here we don't want to scatter, just place into a buffer.
for(int i=0;i<buffer_size;i++){
merge(u_comm_buf[u_comm_offset+i],pointers);
}
}
// Here we don't want to scatter, just place into a buffer.
std::cout<< "merging "<<std::endl;
for(int i=0;i<buffer_size;i++){
merge(u_comm_buf[u_comm_offset+i],rpointers);
}
}
}
};

View File

@ -98,6 +98,9 @@ public:
index = index / dims[d];
}
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
CoorFromIndex(coor,Oindex,_rdimensions);
}
static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
int nd=dims.size();
int stride=1;
@ -107,9 +110,6 @@ public:
stride=stride*dims[d];
}
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
CoorFromIndex(coor,Oindex,_rdimensions);
}
//////////////////////////////////////////////////////////
// SIMD lane addressing

View File

@ -5,7 +5,7 @@ namespace Grid {
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -49,7 +49,7 @@ template<class vobj> void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vo
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj,class scalar_type> void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
template<class vobj,class scalar_type> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];

View File

@ -79,6 +79,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
@ -95,11 +96,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (!offnode) {
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
@ -138,6 +138,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
@ -164,101 +165,58 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(grid->Nd());
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
for(int i=0;i<Nsimd;i++){ // there is a reversal in extract merge
pointers[i] = (scalar_type *)&send_buf_extract[Nsimd-1-i][0];
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[Nsimd-1-i] = rpointers[PermuteMap];
} else {
pointers[Nsimd-1-i] = rpointers[i];
}
}
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
}
}
#endif

View File

@ -22,8 +22,12 @@ const int WilsonMatrix::Tm = 7;
public:
int mu;
void Point(int p) { mu=p;};
vHalfSpinColourVector operator () (vSpinColourVector &in)
void Point(int p) {
mu=p;
std::cout << "WilsonCompressor.Point " << mu<<std::endl;
};
vHalfSpinColourVector operator () (const vSpinColourVector &in)
{
vHalfSpinColourVector ret;
switch(mu) {
@ -90,7 +94,8 @@ void WilsonMatrix::multiply(const LatticeFermion &in, LatticeFermion &out)
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
{
// Stencil.HaloExchange(in,comm_buf);
WilsonCompressor compressor;
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
for(int ss=0;ss<grid->oSites();ss++){
@ -100,10 +105,13 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
vHalfSpinColourVector chi;
vHalfSpinColourVector Uchi;
vHalfSpinColourVector *chi_p;
result=zero;
#if 0
// Xp
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
chi_p = &comm_buf[offset];
if ( local ) {
spProjXp(chi,in._odata[offset]);
@ -112,7 +120,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)());
spReconXp(result,Uchi);
#if 0
// Yp
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
@ -145,6 +152,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
}
mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)());
accumReconTp(result,Uchi);
#endif
// Xm
offset = Stencil._offsets [Xm][ss];
@ -156,6 +164,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
}
mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
accumReconXm(result,Uchi);
#if 0
// Ym
offset = Stencil._offsets [Ym][ss];
@ -190,6 +199,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)());
accumReconTm(result,Uchi);
#endif
out._odata[ss] = result;
}

View File

@ -1,7 +1,6 @@
#include <Grid.h>
#include <parallelIO/GridNerscIO.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
@ -10,7 +9,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
std::vector<int> simd_layout({1,1,2,2});
std::vector<int> mpi_layout ({2,2,2,2});
std::vector<int> mpi_layout ({2,2,1,4});
std::vector<int> latt_size ({8,8,8,16});
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
@ -71,11 +70,11 @@ int main (int argc, char ** argv)
Fine.CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
cout<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<endl;
cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
}
}}}}
}

View File

@ -116,8 +116,6 @@ int main (int argc, char ** argv)
// Insist that operations on random scalars gives
// identical results to on vectors.
Tester<RealD,vRealD>(funcPlus());
std::cout << "==================================="<< std::endl;
std::cout << "Testing vComplexF "<<std::endl;
std::cout << "==================================="<< std::endl;

View File

@ -9,7 +9,7 @@ class SimpleCompressor {
public:
void Point(int) {};
vobj operator() (vobj &arg) {
vobj operator() (const vobj &arg) {
return arg;
}
};

View File

@ -1,5 +1,4 @@
#include <Grid.h>
#include <parallelIO/GridNerscIO.h>
using namespace std;
using namespace Grid;
@ -10,13 +9,19 @@ struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> simd_layout({1,1,2,2});
std::vector<int> mpi_layout ({1,1,1,1});
std::vector<int> mpi_layout ({2,1,1,2});
std::vector<int> latt_size ({8,8,8,8});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
@ -29,19 +34,37 @@ int main (int argc, char ** argv)
LatticeFermion src(&Grid); random(pRNG,src);
LatticeFermion result(&Grid); result=zero;
LatticeFermion ref(&Grid); ref=zero;
LatticeFermion tmp(&Grid); tmp=zero;
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = 1.0;
pokeIndex<3>(Umu,U[mu],mu);
// U[mu] = 1.0;
// pokeIndex<3>(Umu,U[mu],mu);
U[mu] = peekIndex<3>(Umu,mu);
}
{
int mu=0;
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
ref = src;
ref = U[0]*Cshift(ref,0,1);
std::vector<int> mask({0,0,0,0,1,0,0,0});
{ // Naive wilson implementation
ref = zero;
for(int mu=0;mu<Nd;mu++){
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
if( mask[mu] ) {
tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
if( mask[mu+4] ){
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
}
}
RealD mass=0.1;