Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)

Commit f6c53e5039: Merge commit '1e554350acae0e67fa7177ed0db9d4f684a54af2'
@@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
t1=usecond();
mfa = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s = "<< mfa<<std::endl;

/*
int dag=DaggerNo;
t0=usecond();
for(int i=0;i<1;i++){
@@ -144,12 +144,12 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
t1=usecond();
mfl1= flops*100/(t1-t0);
std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;

os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
<< mfc<<" "
<< mfa<<" "
<< mfo<<" "
<< mfl1<<std::endl;
*/

#if 0
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
@@ -166,6 +166,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
std::cout<<std::endl;
return 0;
}
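The hunks above report kernel throughput as the total flop count of the timed calls divided by the elapsed time in microseconds, which gives Mflop/s directly. A minimal standalone sketch of that bookkeeping follows; the usecond() stand-in and the per-call flop count are assumptions for illustration, not values taken from the commit.

#include <chrono>
#include <iostream>

// Stand-in for Grid's usecond(): wall-clock time in microseconds (assumption).
static double usecond(void) {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main(void) {
  const double flops = 1.0e9;   // assumed flop count of one Dw application (hypothetical)
  const int    ncall = 100;     // number of timed kernel calls

  double t0 = usecond();
  // ... invoke the kernel ncall times here; guard against a zero interval in real use ...
  double t1 = usecond();

  // flops * ncall / (t1-t0), with time in microseconds, is already Mflop/s,
  // matching the mfa and mfl1 lines in the hunks above.
  double mflops = flops*ncall/(t1-t0);
  std::cout << "mflop/s = " << mflops << std::endl;
  return 0;
}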
configure.ac
@@ -55,15 +55,6 @@ echo :::::::::::::::::::::::::::::::::::::::::::

AC_CHECK_FUNCS([gettimeofday])

#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])

#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])

#
# SIMD instructions selection
@@ -124,7 +115,7 @@ case ${ac_SIMD} in
echo Configuring for IMCI
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
supported="cross compilation"
ac_ZMM=yes;
ac_ZMM=no;
;;
NEONv8)
echo Configuring for experimental ARMv8a support
File diff suppressed because one or more lines are too long
@@ -47,6 +47,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)

#define RotateBit (0x100)

namespace Grid {

typedef uint32_t Integer;

@@ -321,6 +321,9 @@ PARALLEL_FOR_LOOP
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int rotate_dim = _grid->_simd_layout[dimension]>2;

assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported

int sshift[2];

@@ -368,7 +371,8 @@ PARALLEL_FOR_LOOP
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];

int ly = _grid->_simd_layout[dimension];

// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;

@@ -398,7 +402,7 @@ PARALLEL_FOR_LOOP
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
else permute_slice = (wrap+1)%ly;
}

CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
@@ -418,7 +422,6 @@ PARALLEL_FOR_LOOP
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;

// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
@@ -529,7 +532,7 @@ PARALLEL_FOR_LOOP
_entries[point][lo+o+b]._around_the_world=wrap;
}

}
}
o +=_grid->_slice_stride[dimension];
}

@@ -591,8 +594,11 @@ PARALLEL_FOR_LOOP
template<class compressor>
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
auto thr = HaloExchangeBegin(source,compress);
HaloExchangeComplete(thr);
Mergers.resize(0);
Packets.resize(0);
HaloGather(source,compress);
Communicate();
CommsMerge();
}

void HaloExchangeComplete(std::thread &thr)
@@ -749,6 +755,7 @@ PARALLEL_FOR_LOOP
int comm_dim = _grid->_processors[dimension] >1 ;

assert(comm_dim==1);
// This will not work with a rotate dim
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
@@ -793,7 +800,9 @@ PARALLEL_FOR_LOOP
gathermtime+=usecond();

for(int i=0;i<Nsimd;i++){

// FIXME
// This logic is hard coded to simd_layout ==2 and not allowing >2
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;

int inner_bit = (Nsimd>>(permute_type+1));
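In the HaloExchange hunk above, the threaded begin/complete pair is replaced by an explicit gather, communicate, merge sequence with the packet and merger lists reset first. A minimal illustrative sketch of that split-phase pattern follows; the struct, buffers, and empty bodies are hypothetical stand-ins for exposition, not Grid's stencil classes.

#include <vector>

// Illustrative split-phase halo exchange (all names and bodies are stand-ins).
struct HaloSketch {
  std::vector<double> send_buf, recv_buf;

  void HaloGather()  { /* pack boundary sites into send_buf */ }
  void Communicate() { recv_buf = send_buf; /* stand-in for the actual message exchange */ }
  void CommsMerge()  { /* unpack recv_buf into the SIMD-interleaved halo */ }

  // Mirrors the shape of the new HaloExchange body: reset work lists, then run the three phases.
  void HaloExchange() {
    send_buf.clear();   // plays the role of Packets.resize(0)
    recv_buf.clear();   // plays the role of Mergers.resize(0)
    HaloGather();
    Communicate();
    CommsMerge();
  }
};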
@@ -16,9 +16,13 @@
#define INCLUDED_ALG_REMEZ_H

#include <stddef.h>
#include <Config.h>

//#include <algorithms/approx/bigfloat.h>
#ifdef HAVE_GMP_H
#include <algorithms/approx/bigfloat.h>
#else
#include <algorithms/approx/bigfloat_double.h>
#endif

#define JMAX 10000 //Maximum number of iterations of Newton's approximation
#define SUM_MAX 10 // Maximum number of terms in exponential
@@ -564,6 +564,7 @@ until convergence

for(int j=k1-1; j<k2+1; ++j){
for(int k=0; k<Nm; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt[k+Nm*j] * evec[k];
}
}
@@ -592,6 +593,7 @@ until convergence

for(int j = 0; j<Nk; ++j){
for(int k = 0; k<Nk; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt[k+j*Nm] * evec[k];
}
// std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
@@ -138,6 +138,25 @@ public:
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// FIXME:
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}

for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
@@ -147,12 +166,12 @@ public:
// Array sizing queries
////////////////////////////////////////////////////////////////

inline int iSites(void) { return _isites; };
inline int Nsimd(void) { return _isites; };// Synonymous with iSites
inline int oSites(void) { return _osites; };
inline int lSites(void) { return _isites*_osites; };
inline int gSites(void) { return _isites*_osites*_Nprocessors; };
inline int Nd (void) { return _ndimension;};
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};

inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
@@ -165,6 +184,9 @@ public:
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
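The PermuteType hunk above encodes a shift's SIMD reshuffle either as a permute level, found by counting the split SIMD dimensions inside the given one, or, when the layout in that dimension is wider than two, as a rotation flagged by RotateBit. The sketch below is a simplified reconstruction of that decision as a free function over an assumed layout vector; it is for illustration only and is not the library's member function.

#include <cassert>
#include <vector>

const int RotateBitSketch = 0x100;   // mirrors the RotateBit (0x100) definition above

// Simplified reconstruction of the PermuteType logic (illustrative, not Grid's code).
int PermuteTypeSketch(const std::vector<int> &simd_layout, int dimension) {
  int ndim = (int)simd_layout.size();

  // A SIMD extent above 2 in this dimension needs a rotation rather than a plain permute;
  // the comments above note that only one rotated SIMD dimension is supported.
  if (simd_layout[dimension] > 2) {
    for (int d = 0; d < ndim; d++)
      if (d != dimension) assert(simd_layout[d] == 1);
    return RotateBitSketch;
  }

  // Otherwise count the split SIMD dimensions inside this one; the count selects
  // which permute level (the 1,2,4,8 cases mentioned in the comments) applies.
  int permute_type = 0;
  for (int d = ndim - 1; d > dimension; d--)
    if (simd_layout[d] > 1) permute_type++;
  return permute_type;
}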
@@ -324,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int gd = grid->_gdimensions[dimension];
int ly = grid->_simd_layout[dimension];

// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
@@ -332,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
// the permute type
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;

for(int x=0;x<rd;x++){

@@ -343,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;

// FIXME : This must change where we have a
// Rotate slice.

// Document how this works ; why didn't I do this when I first wrote it...
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;

if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
else permute_slice = (wrap+1)%ly;

if ( (ly>2) && (permute_slice) ) {
assert(permute_type & RotateBit);
permute_type_dist = permute_type|permute_slice;
} else {
permute_type_dist = permute_type;
}

}

if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
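As the comments in the hunk above explain, wrap is how many times the checkerboard-adjusted shift sshift wraps the reduced extent rd, and num is the remaining shift within it; slices below rd-num keep the wrap permute while the rest move one step further, with the new (wrap+1)%ly generalising the old 1-wrap (the two agree when the SIMD layout ly is 2). The following is a small hedged sketch of just that per-slice decision, written as a free function rather than the library code.

// Illustrative reconstruction of the per-slice permute decision; the inputs are assumed
// to be the quantities named in the hunk above (x, sshift, rd, ly).
inline int permute_slice_for(int x, int sshift, int rd, int ly) {
  int wrap = sshift / rd;    // how many times the shift wraps the reduced dimension
  int num  = sshift % rd;    // remaining shift within the reduced dimension

  if (x < rd - num) return wrap;             // slices that do not cross the boundary
  if (ly > 2)       return (wrap + 1) % ly;  // wide SIMD layout: rotate one lane further
  return 1 - wrap;                           // two-lane layout: flip to the other lane
}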
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){

GridBase *grid=l._grid;
@@ -142,6 +142,10 @@ namespace Grid {
mult(&phi(),&U(mu),&chi());
}

template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

};


///////
// Single flavour four spinors with colour index, 5d redblack
///////
template<class S,int Nrepresentation=Nc>
class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:

typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

INHERIT_GIMPL_TYPES(Gimpl);

template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;

typedef iImplSpinor <Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;

// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink <typename Simd::scalar_type> SiteScalarGaugeLink; // scalar

typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

ImplParams Params;

DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {};

bool overlapCommsCompute(void) { return false; };

template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
vsplat(reg,memory);
}
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
{
SiteGaugeLink UU;
for(int i=0;i<Nrepresentation;i++){
for(int j=0;j<Nrepresentation;j++){
vsplat(UU()()(i,j),U(mu)()(i,j));
}
}
mult(&phi(),&UU(),&chi());
}

inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;

GaugeLinkField U (Umu._grid);
GaugeField Uadj(Umu._grid);
for(int mu=0;mu<Nd;mu++){
U = PeekIndex<LorentzIndex>(Umu,mu);
U = adj(Cshift(U,mu,-1));
PokeIndex<LorentzIndex>(Uadj,U,mu);
}

for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);

peekLocalSite(ScalarUmu,Umu,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);

peekLocalSite(ScalarUmu,Uadj,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);

pokeLocalSite(ScalarUds,Uds,lcoor);
}

}

inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
assert(0);
}

inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
assert(0);
}

};


////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid);

GaugeLinkField Utmp(GaugeGrid);
GaugeLinkField U(GaugeGrid);
GaugeLinkField Utmp (GaugeGrid);
GaugeLinkField U (GaugeGrid);
GaugeLinkField Uconj(GaugeGrid);

Lattice<iScalar<vInteger> > coor(GaugeGrid);
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double

typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
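The DomainWallRedBlack5dImpl class above stores the doubled gauge field with scalar (non-vectorised) site types and has loadLinkElement splat each scalar link element across the SIMD register, because the SIMD lanes run along the fifth (Ls) dimension, where the 4d gauge field is constant. Below is a minimal sketch of that splat using an illustrative fixed-width register type; the lane count and types are assumptions, not Grid's Simd class.

#include <array>

// Illustrative stand-ins: an assumed 8-lane register holding plain doubles.
constexpr int kLanes = 8;
using SimdLike = std::array<double, kLanes>;

// Broadcast one scalar gauge-link element to every lane; every s-slice of the
// fifth dimension then sees the same 4d link without a per-lane load.
inline void vsplat_sketch(SimdLike &reg, double memory) {
  for (int lane = 0; lane < kLanes; lane++) reg[lane] = memory;
}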
@@ -288,11 +288,7 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
if ( Impl::overlapCommsCompute () ) {
DhopInternalCommsOverlapCompute(st,U,in,out,dag);
} else {
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
@@ -330,15 +326,6 @@ PARALLEL_FOR_LOOP
}
};


template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {

assert(0);

};


FermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);
@@ -116,9 +116,6 @@ namespace Grid {

void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;


// Constructor
WilsonFermion(GaugeField &_Umu,
@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
// some assertions
|
||||
assert(FiveDimGrid._ndimension==5);
|
||||
assert(FourDimGrid._ndimension==4);
|
||||
|
||||
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||
assert(FourDimRedBlackGrid._ndimension==4);
|
||||
|
||||
assert(FiveDimRedBlackGrid._checker_dim==1);
|
||||
|
||||
// Dimension zero of the five-d is the Ls direction
|
||||
@ -106,11 +104,75 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
dslashtime=0;
|
||||
dslash1time=0;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
Kernels(p),
|
||||
_FiveDimGrid (&FiveDimGrid),
|
||||
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
|
||||
_FourDimGrid (&FourDimGrid),
|
||||
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
|
||||
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
|
||||
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
|
||||
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
|
||||
M5(_M5),
|
||||
Umu(_FourDimGrid),
|
||||
UmuEven(_FourDimRedBlackGrid),
|
||||
UmuOdd (_FourDimRedBlackGrid),
|
||||
Lebesgue(_FourDimGrid),
|
||||
LebesgueEvenOdd(_FourDimRedBlackGrid)
|
||||
{
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
// some assertions
|
||||
assert(FiveDimGrid._ndimension==5);
|
||||
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||
assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
|
||||
assert(FourDimGrid._ndimension==4);
|
||||
assert(FourDimRedBlackGrid._ndimension==4);
|
||||
|
||||
// Dimension zero of the five-d is the Ls direction
|
||||
Ls=FiveDimGrid._fdimensions[0];
|
||||
assert(FiveDimGrid._processors[0] ==1);
|
||||
assert(FiveDimGrid._simd_layout[0] ==nsimd);
|
||||
|
||||
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
|
||||
assert(FiveDimRedBlackGrid._processors[0] ==1);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
|
||||
|
||||
// Other dimensions must match the decomposition of the four-D fields
|
||||
for(int d=0;d<4;d++){
|
||||
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
|
||||
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
|
||||
|
||||
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
|
||||
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
|
||||
assert(FourDimGrid._simd_layout[d]=1);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
|
||||
|
||||
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
|
||||
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
|
||||
}
|
||||
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
GaugeField HUmu(_Umu._grid);
|
||||
HUmu = _Umu*(-0.5);
|
||||
GaugeField HUmu(_Umu._grid);
|
||||
HUmu = _Umu*(-0.5);
|
||||
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
|
||||
pickCheckerboard(Even,UmuEven,Umu);
|
||||
pickCheckerboard(Odd ,UmuOdd,Umu);
|
||||
@ -294,15 +356,12 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
int HT = GridThread::GetHyperThreads();
|
||||
int cores = GridThread::GetCores();
|
||||
int nwork = U._grid->oSites();
|
||||
int LLs = in._grid->_rdimensions[0];
|
||||
|
||||
commtime -=usecond();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
// auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
// st.HaloExchangeComplete(handle);
|
||||
st.HaloExchange(in,compressor);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
@ -318,97 +377,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sU=ss;
|
||||
int sF = s+LLs*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
{
|
||||
int sd;
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sU=ss;
|
||||
int sF = s+LLs*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( this->AsmOptDslash ) {
|
||||
// for(int i=0;i<1;i++){
|
||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
// PerformanceCounter Counter(i);
|
||||
// Counter.Start();
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
|
||||
sU=ss+ ssoff;
|
||||
|
||||
if ( LebesgueOrder::UseLebesgueOrder ) {
|
||||
sU = lo.Reorder(sU);
|
||||
}
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sU=ss;
|
||||
int sF = s+LLs*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
// Counter.Stop();
|
||||
// Counter.Report();
|
||||
// }
|
||||
} else if( this->HandOptDslash ) {
|
||||
/*
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
sU=ss+ ssoff;
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sU=ss;
|
||||
int sF = s+LLs*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sU=ss;
|
||||
int sF = s+LLs*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
@ -418,251 +428,6 @@ PARALLEL_FOR_LOOP
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
int HT = GridThread::GetHyperThreads();
|
||||
int cores = GridThread::GetCores();
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
commtime -=usecond();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
jointime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
// Not loop ordering and data layout.
|
||||
// Designed to create
|
||||
// - per thread reuse in L1 cache for U
|
||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
for(int jjj=0;jjj<100;jjj++){
|
||||
#pragma omp barrier
|
||||
dslashtime -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
{
|
||||
int sd;
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( this->AsmOptDslash ) {
|
||||
// for(int i=0;i<1;i++){
|
||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
// PerformanceCounter Counter(i);
|
||||
// Counter.Start();
|
||||
|
||||
#pragma omp for
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
|
||||
sU=ss+ ssoff;
|
||||
|
||||
if ( LebesgueOrder::UseLebesgueOrder ) {
|
||||
sU = lo.Reorder(sU);
|
||||
}
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Counter.Stop();
|
||||
// Counter.Report();
|
||||
// }
|
||||
} else if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
int HT = GridThread::GetHyperThreads();
|
||||
int cores = GridThread::GetCores();
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
commtime -=usecond();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
jointime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
// Not loop ordering and data layout.
|
||||
// Designed to create
|
||||
// - per thread reuse in L1 cache for U
|
||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
for(int jjj=0;jjj<100;jjj++){
|
||||
#pragma omp barrier
|
||||
dslashtime -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
{
|
||||
int sd;
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=0;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( this->AsmOptDslash ) {
|
||||
// for(int i=0;i<1;i++){
|
||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
// PerformanceCounter Counter(i);
|
||||
// Counter.Start();
|
||||
|
||||
#pragma omp for
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
|
||||
sU=0;
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Counter.Stop();
|
||||
// Counter.Report();
|
||||
// }
|
||||
} else if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||
@ -706,6 +471,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion5D);
|
||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
|
||||
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
|
||||
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
|
||||
|
||||
}}
|
||||
|
||||
|
@ -87,6 +87,7 @@ namespace Grid {
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
// These can be overridden by fancy 5d chiral action
|
||||
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
@ -121,32 +122,12 @@ namespace Grid {
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOMPbench(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalL1bench(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalCommsThenCompute(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
void DhopInternalCommsOverlapCompute(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
// Constructors
|
||||
WilsonFermion5D(GaugeField &_Umu,
|
||||
@ -156,6 +137,15 @@ namespace Grid {
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
double _M5,const ImplParams &p= ImplParams());
|
||||
|
||||
// Constructors
|
||||
WilsonFermion5D(int simd,
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
double _M5,const ImplParams &p= ImplParams());
|
||||
|
||||
// DoubleStore
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
|
@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
#endif
|
||||
|
||||
FermOpTemplateInstantiate(WilsonKernels);
|
||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||
|
||||
}}
|
||||
|
@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#if defined(AVX512) || defined (IMCI)
|
||||
#if defined(AVX512)
|
||||
//#if defined (IMCI)
|
||||
|
||||
#include <simd/Intel512wilson.h>
|
||||
@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
template class WilsonKernels<WilsonImplD>;
|
||||
template class WilsonKernels<GparityWilsonImplF>;
|
||||
template class WilsonKernels<GparityWilsonImplD>;
|
||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||
}}
|
||||
#endif
|
||||
|
@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Chi_11 = ref()(1)(1);\
|
||||
Chi_12 = ref()(1)(2);
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT_2SPIN(A)\
|
||||
auto & ref(U._odata[sU](A)); \
|
||||
U_00 = ref()(0,0);\
|
||||
U_10 = ref()(1,0);\
|
||||
U_20 = ref()(2,0);\
|
||||
U_01 = ref()(0,1);\
|
||||
U_11 = ref()(1,1); \
|
||||
U_21 = ref()(2,1);\
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
UChi_00 = U_00*Chi_00;\
|
||||
UChi_10 = U_00*Chi_10;\
|
||||
UChi_01 = U_10*Chi_00;\
|
||||
@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
UChi_11+= U_11*Chi_11;\
|
||||
UChi_02+= U_21*Chi_01;\
|
||||
UChi_12+= U_21*Chi_11;\
|
||||
U_00 = ref()(0,2);\
|
||||
U_10 = ref()(1,2);\
|
||||
U_20 = ref()(2,2);\
|
||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||
UChi_00+= U_00*Chi_02;\
|
||||
UChi_10+= U_00*Chi_12;\
|
||||
UChi_01+= U_10*Chi_02;\
|
||||
@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
UChi_02+= U_20*Chi_02;\
|
||||
UChi_12+= U_20*Chi_12;
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_00,Chi_00);\
|
||||
permute##dir(Chi_01,Chi_01);\
|
||||
@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
||||
//check consistency of return types between these functions and the ones in WilsonKernels.cc
|
||||
return 0;
|
||||
|
||||
}
|
||||
@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
|
||||
|
||||
|
||||
|
||||
//////////////
|
||||
/*
|
||||
template<>
|
||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
template<>
|
||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<>
|
||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<>
|
||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
|
||||
|
||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
}}
|
||||
|
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
#define INHERIT_GIMPL_TYPES(GImpl) \
typedef typename GImpl::Simd Simd;\
typedef typename GImpl::GaugeLinkField GaugeLinkField;\
typedef typename GImpl::GaugeField GaugeField;
typedef typename GImpl::GaugeField GaugeField;\
typedef typename GImpl::SiteGaugeField SiteGaugeField;\
typedef typename GImpl::SiteGaugeLink SiteGaugeLink;


//
@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
|
||||
{
|
||||
return new GridRedBlackCartesian(FourDimGrid);
|
||||
}
|
||||
|
||||
GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
|
||||
{
|
||||
std::vector<int> simd(4,1);
|
||||
return makeFourDimGrid(latt,simd,mpi);
|
||||
}
|
||||
GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
|
||||
{
|
||||
int N4=FourDimGrid->_ndimension;
|
||||
@ -58,6 +62,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
|
||||
return new GridCartesian(latt5,simd5,mpi5);
|
||||
}
|
||||
|
||||
|
||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
|
||||
{
|
||||
int N4=FourDimGrid->_ndimension;
|
||||
@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
|
||||
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
|
||||
}
|
||||
|
||||
|
||||
GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
|
||||
{
|
||||
int N4=FourDimGrid->_ndimension;
|
||||
int nsimd = FourDimGrid->Nsimd();
|
||||
|
||||
std::vector<int> latt5(1,Ls);
|
||||
std::vector<int> simd5(1,nsimd);
|
||||
std::vector<int> mpi5(1,1);
|
||||
|
||||
for(int d=0;d<N4;d++){
|
||||
latt5.push_back(FourDimGrid->_fdimensions[d]);
|
||||
simd5.push_back(1);
|
||||
mpi5.push_back(FourDimGrid->_processors[d]);
|
||||
}
|
||||
return new GridCartesian(latt5,simd5,mpi5);
|
||||
}
|
||||
|
||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
|
||||
{
|
||||
int N4=FourDimGrid->_ndimension;
|
||||
int nsimd = FourDimGrid->Nsimd();
|
||||
int cbd=0;
|
||||
std::vector<int> latt5(1,Ls);
|
||||
std::vector<int> simd5(1,nsimd);
|
||||
std::vector<int> mpi5(1,1);
|
||||
std::vector<int> cb5(1,1);
|
||||
|
||||
for(int d=0;d<N4;d++){
|
||||
latt5.push_back(FourDimGrid->_fdimensions[d]);
|
||||
simd5.push_back(1);
|
||||
mpi5.push_back(FourDimGrid->_processors[d]);
|
||||
cb5.push_back(1);
|
||||
}
|
||||
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
|
||||
}
|
||||
|
||||
|
||||
}}
|
||||
|
@ -35,9 +35,14 @@ class SpaceTimeGrid {
|
||||
|
||||
static GridCartesian *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
|
||||
static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid);
|
||||
|
||||
static GridCartesian *makeFiveDimGrid (int Ls,const GridCartesian *FourDimGrid);
|
||||
static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
|
||||
|
||||
static GridCartesian *makeFiveDimDWFGrid (int Ls,const GridCartesian *FourDimGrid);
|
||||
static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
|
||||
static GridCartesian *makeFourDimDWFGrid (const std::vector<int> & latt,const std::vector<int> &mpi);
|
||||
|
||||
};
|
||||
|
||||
}}
|
||||
|
@ -410,22 +410,22 @@ namespace Optimization {
|
||||
struct Permute{
|
||||
|
||||
static inline __m256 Permute0(__m256 in){
|
||||
return _mm256_permute2f128_ps(in,in,0x01);
|
||||
return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
|
||||
};
|
||||
static inline __m256 Permute1(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
|
||||
};
|
||||
static inline __m256 Permute2(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
|
||||
};
|
||||
static inline __m256 Permute3(__m256 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static inline __m256d Permute0(__m256d in){
|
||||
return _mm256_permute2f128_pd(in,in,0x01);
|
||||
return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
|
||||
};
|
||||
static inline __m256d Permute1(__m256d in){
|
||||
static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
|
||||
return _mm256_shuffle_pd(in,in,0x5);
|
||||
};
|
||||
static inline __m256d Permute2(__m256d in){
|
||||
@ -437,6 +437,111 @@ namespace Optimization {
|
||||
|
||||
};
|
||||
|
||||
#if defined (AVX2) || defined (AVXFMA4)
|
||||
#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
|
||||
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
|
||||
#endif
|
||||
|
||||
#if defined (AVX1)
|
||||
|
||||
#define _mm256_alignr_epi32(ret,a,b,n) { \
|
||||
__m128 aa, bb; \
|
||||
\
|
||||
aa = _mm256_extractf128_ps(a,1); \
|
||||
bb = _mm256_extractf128_ps(b,1); \
|
||||
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
|
||||
ret = _mm256_insertf128_ps(ret,aa,1); \
|
||||
\
|
||||
aa = _mm256_extractf128_ps(a,0); \
|
||||
bb = _mm256_extractf128_ps(b,0); \
|
||||
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
|
||||
ret = _mm256_insertf128_ps(ret,aa,0); \
|
||||
}
|
||||
|
||||
#define _mm256_alignr_epi64(ret,a,b,n) { \
|
||||
__m128d aa, bb; \
|
||||
\
|
||||
aa = _mm256_extractf128_pd(a,1); \
|
||||
bb = _mm256_extractf128_pd(b,1); \
|
||||
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
|
||||
ret = _mm256_insertf128_pd(ret,aa,1); \
|
||||
\
|
||||
aa = _mm256_extractf128_pd(a,0); \
|
||||
bb = _mm256_extractf128_pd(b,0); \
|
||||
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
|
||||
ret = _mm256_insertf128_pd(ret,aa,0); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
inline std::ostream & operator << (std::ostream& stream, const __m256 a)
|
||||
{
|
||||
const float *p=(const float *)&a;
|
||||
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
|
||||
return stream;
|
||||
};
|
||||
inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
|
||||
{
|
||||
const double *p=(const double *)&a;
|
||||
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
|
||||
return stream;
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
|
||||
static inline __m256 rotate(__m256 in,int n){
|
||||
switch(n){
|
||||
case 0: return tRotate<0>(in);break;
|
||||
case 1: return tRotate<1>(in);break;
|
||||
case 2: return tRotate<2>(in);break;
|
||||
case 3: return tRotate<3>(in);break;
|
||||
case 4: return tRotate<4>(in);break;
|
||||
case 5: return tRotate<5>(in);break;
|
||||
case 6: return tRotate<6>(in);break;
|
||||
case 7: return tRotate<7>(in);break;
|
||||
default: assert(0);
|
||||
}
|
||||
}
|
||||
static inline __m256d rotate(__m256d in,int n){
|
||||
switch(n){
|
||||
case 0: return tRotate<0>(in);break;
|
||||
case 1: return tRotate<1>(in);break;
|
||||
case 2: return tRotate<2>(in);break;
|
||||
case 3: return tRotate<3>(in);break;
|
||||
default: assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int n>
|
||||
static inline __m256 tRotate(__m256 in){
|
||||
__m256 tmp = Permute::Permute0(in);
|
||||
__m256 ret;
|
||||
if ( n > 3 ) {
|
||||
_mm256_alignr_epi32(ret,in,tmp,n);
|
||||
} else {
|
||||
_mm256_alignr_epi32(ret,tmp,in,n);
|
||||
}
|
||||
// std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
|
||||
return ret;
|
||||
};
|
||||
|
||||
template<int n>
|
||||
static inline __m256d tRotate(__m256d in){
|
||||
__m256d tmp = Permute::Permute0(in);
|
||||
__m256d ret;
|
||||
if ( n > 1 ) {
|
||||
_mm256_alignr_epi64(ret,in,tmp,n);
|
||||
} else {
|
||||
_mm256_alignr_epi64(ret,tmp,in,n);
|
||||
}
|
||||
// std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
|
||||
return ret;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
|
@ -309,6 +309,54 @@ namespace Optimization {
|
||||
};
|
||||
|
||||
|
||||
struct Rotate{
|
||||
|
||||
static inline __m512 rotate(__m512 in,int n){
|
||||
switch(n){
|
||||
case 0: return tRotate<0>(in);break;
|
||||
case 1: return tRotate<1>(in);break;
|
||||
case 2: return tRotate<2>(in);break;
|
||||
case 3: return tRotate<3>(in);break;
|
||||
case 4: return tRotate<4>(in);break;
|
||||
case 5: return tRotate<5>(in);break;
|
||||
case 6: return tRotate<6>(in);break;
|
||||
case 7: return tRotate<7>(in);break;
|
||||
|
||||
case 8 : return tRotate<8>(in);break;
|
||||
case 9 : return tRotate<9>(in);break;
|
||||
case 10: return tRotate<10>(in);break;
|
||||
case 11: return tRotate<11>(in);break;
|
||||
case 12: return tRotate<12>(in);break;
|
||||
case 13: return tRotate<13>(in);break;
|
||||
case 14: return tRotate<14>(in);break;
|
||||
case 15: return tRotate<15>(in);break;
|
||||
default: assert(0);
|
||||
}
|
||||
}
|
||||
static inline __m512d rotate(__m512d in,int n){
|
||||
switch(n){
|
||||
case 0: return tRotate<0>(in);break;
|
||||
case 1: return tRotate<1>(in);break;
|
||||
case 2: return tRotate<2>(in);break;
|
||||
case 3: return tRotate<3>(in);break;
|
||||
case 4: return tRotate<4>(in);break;
|
||||
case 5: return tRotate<5>(in);break;
|
||||
case 6: return tRotate<6>(in);break;
|
||||
case 7: return tRotate<7>(in);break;
|
||||
default: assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
template<int n> static inline __m512 tRotate(__m512 in){
|
||||
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
|
||||
};
|
||||
|
||||
template<int n> static inline __m512d tRotate(__m512d in){
|
||||
return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
|
@ -55,51 +55,67 @@ namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline float operator()(float a, float b){
|
||||
return 0;
|
||||
inline u128f operator()(float a, float b){
|
||||
u128f out;
|
||||
out.f[0] = a;
|
||||
out.f[1] = b;
|
||||
out.f[2] = a;
|
||||
out.f[3] = b;
|
||||
return out;
|
||||
}
|
||||
// Real float
|
||||
inline float operator()(float a){
|
||||
return 0;
|
||||
inline u128f operator()(float a){
|
||||
u128f out;
|
||||
out.f[0] = a;
|
||||
out.f[1] = a;
|
||||
out.f[2] = a;
|
||||
out.f[3] = a;
|
||||
return out;
|
||||
}
|
||||
//Complex double
|
||||
inline double operator()(double a, double b){
|
||||
return 0;
|
||||
inline u128d operator()(double a, double b){
|
||||
u128d out;
|
||||
out.f[0] = a;
|
||||
out.f[1] = b;
|
||||
return out;
|
||||
}
|
||||
//Real double
|
||||
inline double operator()(double a){
|
||||
return 0;
|
||||
inline u128d operator()(double a){
|
||||
u128d out;
|
||||
out.f[0] = a;
|
||||
out.f[1] = a;
|
||||
return out;
|
||||
}
|
||||
//Integer
|
||||
inline int operator()(Integer a){
|
||||
return 0;
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(float a, float* F){
|
||||
|
||||
inline void operator()(u128f a, float* F){
|
||||
memcpy(F,a.f,4*sizeof(float));
|
||||
}
|
||||
//Double
|
||||
inline void operator()(double a, double* D){
|
||||
|
||||
inline void operator()(u128d a, double* D){
|
||||
memcpy(D,a.f,2*sizeof(double));
|
||||
}
|
||||
//Integer
|
||||
inline void operator()(int a, Integer* I){
|
||||
|
||||
I[0] = a;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
inline void operator()(float * a, float b){
|
||||
|
||||
inline void operator()(float * a, u128f b){
|
||||
memcpy(a,b.f,4*sizeof(float));
|
||||
}
|
||||
//Double
|
||||
inline void operator()(double * a, double b){
|
||||
|
||||
inline void operator()(double * a, u128d b){
|
||||
memcpy(a,b.f,2*sizeof(double));
|
||||
}
|
||||
|
||||
|
||||
@ -107,24 +123,40 @@ namespace Optimization {
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
inline float operator()(Grid::ComplexF *a){
|
||||
return 0;
|
||||
inline u128f operator()(Grid::ComplexF *a){
|
||||
u128f out;
|
||||
out.f[0] = a[0].real();
|
||||
out.f[1] = a[0].imag();
|
||||
out.f[2] = a[1].real();
|
||||
out.f[3] = a[1].imag();
|
||||
return out;
|
||||
}
|
||||
// Complex double
|
||||
inline double operator()(Grid::ComplexD *a){
|
||||
return 0;
|
||||
inline u128d operator()(Grid::ComplexD *a){
|
||||
u128d out;
|
||||
out.f[0] = a[0].real();
|
||||
out.f[1] = a[0].imag();
|
||||
return out;
|
||||
}
|
||||
// Real float
|
||||
inline float operator()(float *a){
|
||||
return 0;
|
||||
inline u128f operator()(float *a){
|
||||
u128f out;
|
||||
out.f[0] = a[0];
|
||||
out.f[1] = a[1];
|
||||
out.f[2] = a[2];
|
||||
out.f[3] = a[3];
|
||||
return out;
|
||||
}
|
||||
// Real double
|
||||
inline double operator()(double *a){
|
||||
return 0;
|
||||
inline u128d operator()(double *a){
|
||||
u128d out;
|
||||
out.f[0] = a[0];
|
||||
out.f[1] = a[1];
|
||||
return out;
|
||||
}
|
||||
// Integer
|
||||
inline int operator()(Integer *a){
|
||||
return 0;
|
||||
return a[0];
|
||||
}
|
||||
|
||||
|
||||
@ -146,129 +178,198 @@ namespace Optimization {

  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline float operator()(float a, float b){
      return 0;
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0] + b.f[0];
      out.f[1] = a.f[1] + b.f[1];
      out.f[2] = a.f[2] + b.f[2];
      out.f[3] = a.f[3] + b.f[3];
      return out;
    }
    //Complex/Real double
    inline double operator()(double a, double b){
      return 0;
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0] + b.f[0];
      out.f[1] = a.f[1] + b.f[1];
      return out;
    }
    //Integer
    inline int operator()(int a, int b){
      return 0;
      return a + b;
    }
  };

  struct Sub{
    //Complex/Real float
    inline float operator()(float a, float b){
      return 0;
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0] - b.f[0];
      out.f[1] = a.f[1] - b.f[1];
      out.f[2] = a.f[2] - b.f[2];
      out.f[3] = a.f[3] - b.f[3];
      return out;
    }
    //Complex/Real double
    inline double operator()(double a, double b){
      return 0;
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0] - b.f[0];
      out.f[1] = a.f[1] - b.f[1];
      return out;
    }
    //Integer
    inline int operator()(int a, int b){
      return 0;
      return a-b;
    }
  };

  struct MultComplex{
    // Complex float
    inline float operator()(float a, float b){
      return 0;
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
      return out;
    }
    // Complex double
    inline double operator()(double a, double b){
      return 0;
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
      return out;
    }
  };

  struct Mult{
    inline float mac(float a, float b,double c){
      return 0;
    }
    inline double mac(double a, double b,double c){
      return 0;
    }
    //CK: Appear unneeded
    // inline float mac(float a, float b,double c){
    //   return 0;
    // }
    // inline double mac(double a, double b,double c){
    //   return 0;
    // }

    // Real float
    inline float operator()(float a, float b){
      return 0;
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0]*b.f[0];
      out.f[1] = a.f[1]*b.f[1];
      out.f[2] = a.f[2]*b.f[2];
      out.f[3] = a.f[3]*b.f[3];
      return out;
    }
    // Real double
    inline double operator()(double a, double b){
      return 0;
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0]*b.f[0];
      out.f[1] = a.f[1]*b.f[1];
      return out;
    }
    // Integer
    inline int operator()(int a, int b){
      return 0;
      return a*b;
    }
  };
  struct Conj{
    // Complex single
    inline float operator()(float in){
      return 0;
    inline u128f operator()(u128f in){
      u128f out;
      out.f[0] = in.f[0];
      out.f[1] = -in.f[1];
      out.f[2] = in.f[2];
      out.f[3] = -in.f[3];
      return out;
    }
    // Complex double
    inline double operator()(double in){
      return 0;
    inline u128d operator()(u128d in){
      u128d out;
      out.f[0] = in.f[0];
      out.f[1] = -in.f[1];
      return out;
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    //Complex single
    inline float operator()(float in, float ret){
      return 0;
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
      u128f out;
      out.f[0] = in.f[1];
      out.f[1] = -in.f[0];
      out.f[2] = in.f[3];
      out.f[3] = -in.f[2];
      return out;
    }
    //Complex double
    inline double operator()(double in, double ret){
      return 0;
    inline u128d operator()(u128d in, u128d ret){
      u128d out;
      out.f[0] = in.f[1];
      out.f[1] = -in.f[0];
      return out;
    }

  };

  struct TimesI{
    //Complex single
    inline float operator()(float in, float ret){
      return 0;
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
      u128f out;
      out.f[0] = -in.f[1];
      out.f[1] = in.f[0];
      out.f[2] = -in.f[3];
      out.f[3] = in.f[2];
      return out;
    }
    //Complex double
    inline double operator()(double in, double ret){
      return 0;
    inline u128d operator()(u128d in, u128d ret){
      u128d out;
      out.f[0] = -in.f[1];
      out.f[1] = in.f[0];
      return out;
    }
  };
  //////////////////////////////////////////////
  // Some Template specialization
  struct Permute{

    static inline float Permute0(float in){
    //We just have to mirror the permutes of Grid_sse4.h
    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
      u128f out;
      out.f[0] = in.f[2];
      out.f[1] = in.f[3];
      out.f[2] = in.f[0];
      out.f[3] = in.f[1];
      return out;
    };
    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
      u128f out;
      out.f[0] = in.f[1];
      out.f[1] = in.f[0];
      out.f[2] = in.f[3];
      out.f[3] = in.f[2];
      return out;
    };
    static inline u128f Permute2(u128f in){
      return in;
    };
    static inline float Permute1(float in){
      return in;
    };
    static inline float Permute2(float in){
      return in;
    };
    static inline float Permute3(float in){
    static inline u128f Permute3(u128f in){
      return in;
    };

    static inline double Permute0(double in){
    static inline u128d Permute0(u128d in){ //AB -> BA
      u128d out;
      out.f[0] = in.f[1];
      out.f[1] = in.f[0];
      return out;
    };
    static inline u128d Permute1(u128d in){
      return in;
    };
    static inline double Permute1(double in){
    static inline u128d Permute2(u128d in){
      return in;
    };
    static inline double Permute2(double in){
      return in;
    };
    static inline double Permute3(double in){
    static inline u128d Permute3(u128d in){
      return in;
    };

@ -280,26 +381,26 @@ namespace Optimization {

  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
    return 0;
  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
    return 0;
  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
    return 0;
  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
    return Grid::ComplexD(in.f[0],in.f[1]);
  }

  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
    return 0;
  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
    return in.f[0] + in.f[1];
  }

  //Integer Reduce

@ -314,8 +415,8 @@ namespace Optimization {

  //////////////////////////////////////////////////////////////////////////////////////
  // Here assign types

  typedef float  SIMD_Ftype; // Single precision type
  typedef double SIMD_Dtype; // Double precision type
  typedef Optimization::u128f SIMD_Ftype; // Single precision type
  typedef Optimization::u128d SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype; // Integer type

  // prefetch utilities
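For context, the generic path above emulates a 128-bit SIMD register with a plain union of four floats (or two doubles), so every functor is just scalar code over the lanes. A minimal standalone sketch of the idea, modelled on the MultComplex functor in this hunk; the union and function names here are our own illustration, not the library's definitions:

#include <cstdio>

// Hypothetical stand-in for Optimization::u128f: two packed single-precision complex numbers.
struct my_u128f { float f[4]; };

// (re0,im0,re1,im1) * (re0',im0',re1',im1'), lane by lane, as MultComplex does above.
inline my_u128f mult_complex(my_u128f a, my_u128f b){
  my_u128f out;
  out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];  // real part of first complex
  out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];  // imag part of first complex
  out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
  out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
  return out;
}

int main(){
  my_u128f a = {{1,2,3,4}};        // (1+2i, 3+4i)
  my_u128f b = {{5,6,7,8}};        // (5+6i, 7+8i)
  my_u128f c = mult_complex(a,b);  // expect (-7+16i, -11+52i)
  printf("%g %g %g %g\n", c.f[0], c.f[1], c.f[2], c.f[3]);
  return 0;
}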
@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

//----------------------------------------------------------------------

#include <immintrin.h>
#include <zmmintrin.h>

namespace Grid{
namespace Optimization {

  struct Vsplat{

@ -316,6 +318,54 @@ namespace Optimization {

  };

  struct Rotate{

    static inline __m512 rotate(__m512 in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      case 4: return tRotate<4>(in);break;
      case 5: return tRotate<5>(in);break;
      case 6: return tRotate<6>(in);break;
      case 7: return tRotate<7>(in);break;

      case 8 : return tRotate<8>(in);break;
      case 9 : return tRotate<9>(in);break;
      case 10: return tRotate<10>(in);break;
      case 11: return tRotate<11>(in);break;
      case 12: return tRotate<12>(in);break;
      case 13: return tRotate<13>(in);break;
      case 14: return tRotate<14>(in);break;
      case 15: return tRotate<15>(in);break;
      default: assert(0);
      }
    }
    static inline __m512d rotate(__m512d in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      case 4: return tRotate<4>(in);break;
      case 5: return tRotate<5>(in);break;
      case 6: return tRotate<6>(in);break;
      case 7: return tRotate<7>(in);break;
      default: assert(0);
      }
    }

    template<int n> static inline __m512 tRotate(__m512 in){
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
    };

    template<int n> static inline __m512d tRotate(__m512d in){
      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
    };

  };

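A rotate by n here means lane i of the output receives lane (i+n) mod 16 of the input; it is built on _mm512_alignr_epi32 with the same register passed twice, and the switch exists only because the lane count must be an immediate. A hedged scalar model of the intended semantics (the helper name is ours, not the library's):

#include <cassert>

// Reference model of Rotate::rotate for a 16-lane single-precision vector.
// out[i] = in[(i+n) % 16], matching _mm512_alignr_epi32(in,in,n).
void rotate16_ref(const float in[16], float out[16], int n){
  assert(n >= 0 && n < 16);
  for(int i=0;i<16;i++) out[i] = in[(i+n)%16];
}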
  //////////////////////////////////////////////

@ -358,7 +408,7 @@ namespace Optimization {

  //////////////////////////////////////////////////////////////////////////////////////
  // Here assign types
namespace Grid {

  typedef __m512  SIMD_Ftype; // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type
@ -267,10 +267,10 @@ namespace Optimization {

  struct Permute{

    static inline __m128 Permute0(__m128 in){
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
    };
    static inline __m128 Permute1(__m128 in){
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
    };
    static inline __m128 Permute2(__m128 in){
      return in;

@ -279,7 +279,7 @@ namespace Optimization {
      return in;
    };

    static inline __m128d Permute0(__m128d in){
    static inline __m128d Permute0(__m128d in){ //AB -> BA
      return _mm_shuffle_pd(in,in,0x1);
    };
    static inline __m128d Permute1(__m128d in){

@ -294,6 +294,32 @@ namespace Optimization {

  };

  struct Rotate{

    static inline __m128 rotate(__m128 in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      default: assert(0);
      }
    }
    static inline __m128d rotate(__m128d in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      default: assert(0);
      }
    }

#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)

    template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };

  };
  //////////////////////////////////////////////
  // Some Template specialization
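SSE has no 32- or 64-bit element alignr, so the two macros above fake one with the SSSE3 byte-wise _mm_alignr_epi8, scaling the lane count to bytes (4n for floats, 8n for doubles, wrapped modulo 16). An illustrative check of that mapping under those assumptions (the helper below is ours, not part of Grid, and needs SSSE3 or higher):

#include <tmmintrin.h>   // SSSE3 intrinsics, for _mm_alignr_epi8

// Rotating a __m128d by one lane is one double, i.e. 8 bytes: this swaps the
// two doubles, the same effect as Permute0 on __m128d.
static inline __m128d rotate_pd_by1(__m128d in){
  return (__m128d)_mm_alignr_epi8((__m128i)in,(__m128i)in,8);
}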
@ -299,16 +299,44 @@ namespace Grid {
    }
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
    {
      if      (perm==3) permute3(y,b);
      else if (perm==2) permute2(y,b);
      else if (perm==1) permute1(y,b);
      else if (perm==0) permute0(y,b);
      if ( perm & RotateBit ) {
        int dist = perm&0xF;
        y=rotate(b,dist);
        return;
      }
      switch(perm){
      case 3: permute3(y,b); break;
      case 2: permute2(y,b); break;
      case 1: permute1(y,b); break;
      case 0: permute0(y,b); break;
      default: assert(0);
      }
    }

  };// end of Grid_simd class definition

  ////////////////////////////////////////////////////////////////////
  // General rotate
  ////////////////////////////////////////////////////////////////////
  template <class S, class V, IfNotComplex<S> =0>
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    Grid_simd<S,V> ret;
    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,nrot);
    return ret;
  }
  template <class S, class V, IfComplex<S> =0>
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    Grid_simd<S,V> ret;
    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
    return ret;
  }

  ///////////////////////
  // Splat
  ///////////////////////
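With this change the friend permute doubles as the entry point for rotations: when the caller sets RotateBit (0x100) in perm, the low nibble is read as a lane distance and forwarded to rotate(); otherwise perm selects one of the nested permutes as before. A hedged usage sketch in the style of the Test_simd harness further down (the lane values are illustrative only):

#include <Grid.h>
using namespace Grid;

int main(int argc,char **argv){
  Grid_init(&argc,&argv);

  std::vector<RealF> lanes(vRealF::Nsimd());
  for(int i=0;i<(int)lanes.size();i++) lanes[i] = i;

  vRealF in, out, out2;
  merge<vRealF,RealF>(in, lanes);    // pack the scalars into the SIMD register
  out = rotate(in, 3);               // lane i of out holds lane (i+3)%Nsimd of in
  permute(out2, in, RotateBit | 3);  // same rotation, requested through the perm word

  Grid_finalize();
}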
@ -44,7 +44,7 @@ template<class vsimd,class scalar>
inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
                    std::vector<scalar *> &extracted,int offset){
  // FIXME: bounce off memory is painful
  static const int Nsimd=vsimd::Nsimd();
  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
  int Nextr=extracted.size();
  int s=Nsimd/Nextr;

@ -59,7 +59,9 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
template<class vsimd,class scalar>
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
                  std::vector<scalar *> &extracted,int offset){
  static const int Nsimd=vsimd::Nsimd();

  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);

  int Nextr=extracted.size();
  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
  // replicate n-fold. Use to allow Integer masks to

@ -127,7 +129,7 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  static const int Nsimd=vobj::vector_type::Nsimd();
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  static const int words=sizeof(vobj)/sizeof(vector_type);
  int Nextr=extracted.size();
  int s=Nsimd/Nextr;

@ -174,7 +176,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  static const int Nsimd=vobj::vector_type::Nsimd();
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  static const int words=sizeof(vobj)/sizeof(vector_type);

  int Nextr = extracted.size();

@ -199,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  const int words=sizeof(vobj)/sizeof(vector_type);

  int Nextr=extracted.size();
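The common thread of these hunks is that the lane count is now derived from sizes rather than from the vector type's Nsimd(), so extract/merge keep working when the destination scalar type differs from the vector's natural scalar (sparse occupation of the SIMD vector). A hedged compile-time illustration, using a stand-in for the u128f union defined earlier in this commit:

// Our own illustration, not library code: the sizeof ratio counts scalar slots per vector.
struct u128f_model { float f[4]; };   // stand-in for Optimization::u128f
static_assert(sizeof(u128f_model)/sizeof(float) == 4, "four scalar slots per emulated vector");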
@ -1,5 +1,5 @@

bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd

Test_GaugeAction_SOURCES=Test_GaugeAction.cc

@ -58,6 +58,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
Test_cshift_red_black_LDADD=-lGrid


Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
Test_cshift_red_black_rotate_LDADD=-lGrid


Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
Test_cshift_rotate_LDADD=-lGrid


Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
Test_dwf_cg_prec_LDADD=-lGrid

@ -98,6 +106,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
Test_dwf_lanczos_LDADD=-lGrid


Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
Test_dwf_rb5d_LDADD=-lGrid


Test_gamma_SOURCES=Test_gamma.cc
Test_gamma_LDADD=-lGrid

223
tests/Test_cshift_red_black_rotate.cc
Normal file
@ -0,0 +1,223 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift_red_black.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size  = GridDefaultLatt();
  int Nd = latt_size.size();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  std::vector<int> mask(Nd,1);
  mask[0]=0;

  GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);

  GridParallelRNG FineRNG(&Fine);  FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);
  LatticeComplex rbShiftU(&Fine);
  LatticeComplex Ue(&RBFine);
  LatticeComplex Uo(&RBFine);
  LatticeComplex ShiftUe(&RBFine);
  LatticeComplex ShiftUo(&RBFine);
  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<Nd;d++){
      // Integer i=10000;
      Integer i=0;
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride+i;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  pickCheckerboard(Even,Ue,U);
  pickCheckerboard(Odd,Uo,U);

  //  std::cout<<GridLogMessage << U<<std::endl;
  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;

  TComplex cm;
  TComplex cmeo;
  for(int dir=0;dir<Nd;dir++){
    //  if ( dir!=1 ) continue;
    for(int shift=0;shift<latt_size[dir];shift++){

      std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      std::cout<<GridLogMessage<<"Even grid"<<std::endl;
      ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
      std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;

      std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
      ShiftUo = Cshift(Uo,dir,shift);
      std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;

      std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
      setCheckerboard(rbShiftU,ShiftUe);
      setCheckerboard(rbShiftU,ShiftUo);
      std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;

      std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
      ShiftU = Cshift(U,dir,shift);    // Shift everything
      std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;

      std::vector<int> coor(4);

      std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        /////////  double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        double nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex ctmp = cm;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exit(-1);
        }
      }}}}

      int exx=0;
      std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,rbShiftU,coor);

        Integer checkerboard = RBFine.CheckerBoard(coor);

        // std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
        // std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
        // std::cout << "Uo " << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
        if ( checkerboard == ShiftUo.checkerboard ) {
          peekSite(cmeo,ShiftUo,coor);
        } else {
          peekSite(cmeo,ShiftUe,coor);
        }

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        std::vector<int> peer(4);
        Complex ctmp=cmeo;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        double nrm = abs(cmeo()()()-scm);
        if (nrm != 0) {
          std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        }

        ctmp=cm;
        index=real(ctmp);
        nrm = abs(scm-cm()()());

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        } else if (1) {
          std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
        }
      }}}}
      if (exx) exit(-1);

    }
  }

  Grid_finalize();
}
125
tests/Test_cshift_rotate.cc
Normal file
@ -0,0 +1,125 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size  = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  GridCartesian Fine(latt_size,simd_layout,mpi_layout);

  GridParallelRNG FineRNG(&Fine);  FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);

  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<4;d++){
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  TComplex cm;

  for(int dir=0;dir<4;dir++){
    for(int shift=0;shift<latt_size[dir];shift++){
      if ( Fine.IsBoss() )
        std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      ShiftU = Cshift(U,dir,shift);    // Shift everything

      /*
      std::cout << "U[0]" << U[0]<<std::endl;
      std::cout << "U[1]" << U[1]<<std::endl;
      std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
      std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
      */
      std::vector<int> coor(4);

      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex tmp =cm;
        Integer index=real(tmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        /*
        else {
          std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        */
      }}}}
    }
  }

  Grid_finalize();
}
138
tests/Test_dwf_rb5d.cc
Normal file
@ -0,0 +1,138 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_dwf_even_odd.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
};

typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;

typedef WilsonFermion5D<WilsonImplR> WilsonFermion5D_OKR;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  const int Ls=32;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);

  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);

  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid);    ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  // Only one non-zero (y)
  /*
  Umu=zero;
  for(int nn=0;nn<Nd;nn++){
    random(RNG4,U[nn]);
    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
  */

  RealD mass=0.1;
  RealD M5  =1.8;
  typename WilsonFermion5DR::ImplParams params;

  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);

  Dw.Dhop(src,result,0);

  std::cout << "Norm src = "<<norm2(src)<<" Norm res = "<<norm2(result) << std::endl;

  GridCartesian         * FokGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FokrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  WilsonFermion5D_OKR Dok(Umu,*FokGrid,*FokrbGrid,*UGrid,*UrbGrid,M5,params);

  LatticeFermion src_ok (FokGrid);
  LatticeFermion ref_ok(FokGrid);
  LatticeFermion result_ok(FokGrid);

  for(int lidx=0;lidx<FGrid->lSites();lidx++){
    std::vector<int> lcoor;
    FGrid->LocalIndexToLocalCoor(lidx,lcoor);

    SpinColourVector siteSrc;

    peekLocalSite(siteSrc,src,lcoor);
    pokeLocalSite(siteSrc,src_ok,lcoor);

    peekLocalSite(siteSrc,result,lcoor);
    pokeLocalSite(siteSrc,result_ok,lcoor);
  }

  Dok.Dhop(src_ok,ref_ok,0);

  std::cout << "Reference = "<<norm2(src_ok)<<" res = "<<norm2(ref_ok) << std::endl;
  ref_ok = ref_ok - result_ok;
  std::cout << "Reference diff = "<<norm2(result_ok)<< std::endl;
  std::cout << "Reference diff = "<<norm2(ref_ok)<< std::endl;

  Grid_finalize();
}
@ -69,7 +69,6 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesI(i1);}
  std::string name(void) const { return std::string("timesI"); }
};

class funcTimesMinusI {
public:
  funcTimesMinusI() {};

@ -97,6 +96,7 @@ public:
// zeroit
// permute


class funcReduce {
public:
  funcReduce() {};

@ -208,6 +208,100 @@ void ReductionTester(const functor &func)



class funcPermute {
public:
  int n;
  funcPermute(int _n) { n=_n;};
  template<class vec>  void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);}
  template<class scal> void apply(std::vector<scal> &rr,std::vector<scal> &in) const {
    int sz=in.size();
    int msk = sz>>(n+1);
    for(int i=0;i<sz;i++){
      rr[i] = in[ i^msk ];
    }
  }
  std::string name(void) const { return std::string("Permute"); }
};
class funcRotate {
public:
  int n;
  funcRotate(int _n) { n=_n;};
  template<class vec>  void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);}
  template<class scal> void apply(std::vector<scal> &rr,std::vector<scal> &in) const {
    int sz = in.size();
    for(int i=0;i<sz;i++){
      rr[i] = in[(i+n)%sz];
    }
  }
  std::string name(void) const { return std::string("Rotate"); }
};


template<class scal, class vec,class functor >
void PermTester(const functor &func)
{
  GridSerialRNG sRNG;
  sRNG.SeedRandomDevice();

  int Nsimd = vec::Nsimd();

  std::vector<scal> input1(Nsimd);
  std::vector<scal> input2(Nsimd);
  std::vector<scal> result(Nsimd);
  std::vector<scal> reference(Nsimd);

  std::vector<vec,alignedAllocator<vec> > buf(3);
  vec & v_input1 = buf[0];
  vec & v_input2 = buf[1];
  vec & v_result = buf[2];

  for(int i=0;i<Nsimd;i++){
    random(sRNG,input1[i]);
    random(sRNG,input2[i]);
    random(sRNG,result[i]);
  }

  merge<vec,scal>(v_input1,input1);
  merge<vec,scal>(v_input2,input2);
  merge<vec,scal>(v_result,result);

  func(v_result,v_input1,v_input2);

  func.apply(reference,input1);

  extract<vec,scal>(v_result,result);
  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;

  int ok=0;
  if (0) {
    std::cout<<GridLogMessage<< "*****" << std::endl;
    for(int i=0;i<Nsimd;i++){
      std::cout<< input1[i]<<" ";
    }
    std::cout <<std::endl;
    for(int i=0;i<Nsimd;i++){
      std::cout<< result[i]<<" ";
    }
    std::cout <<std::endl;
    for(int i=0;i<Nsimd;i++){
      std::cout<< reference[i]<<" ";
    }
    std::cout <<std::endl;
    std::cout<<GridLogMessage<< "*****" << std::endl;
  }
  for(int i=0;i<Nsimd;i++){
    if ( abs(reference[i]-result[i])>1.0e-7){
      std::cout<<GridLogMessage<< "*****" << std::endl;
      std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
      ok++;
    }
  }
  if ( ok==0 ) {
    std::cout<<GridLogMessage << " OK!" <<std::endl;
  }
  assert(ok==0);
}

|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
@ -235,6 +329,24 @@ int main (int argc, char ** argv)
|
||||
Tester<RealF,vRealF>(funcInnerProduct());
|
||||
ReductionTester<RealF,RealF,vRealF>(funcReduce());
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealF permutes "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vRealF::Nsimd();i++){
|
||||
PermTester<RealF,vRealF>(funcPermute(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
for(int r=0;r<vRealF::Nsimd();r++){
|
||||
PermTester<RealF,vRealF>(funcRotate(r));
|
||||
}
|
||||
|
||||
|
||||
std::cout << GridLogMessage <<"==================================="<< std::endl;
|
||||
std::cout << GridLogMessage <<"Testing vRealD "<<std::endl;
|
||||
std::cout << GridLogMessage <<"==================================="<< std::endl;
|
||||
@ -247,6 +359,25 @@ int main (int argc, char ** argv)
|
||||
Tester<RealD,vRealD>(funcInnerProduct());
|
||||
ReductionTester<RealD,RealD,vRealD>(funcReduce());
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealD permutes "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vRealD::Nsimd();i++){
|
||||
PermTester<RealD,vRealD>(funcPermute(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
for(int r=0;r<vRealD::Nsimd();r++){
|
||||
PermTester<RealD,vRealD>(funcRotate(r));
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::cout << GridLogMessage <<"==================================="<< std::endl;
|
||||
std::cout << GridLogMessage <<"Testing vComplexF "<<std::endl;
|
||||
std::cout << GridLogMessage <<"==================================="<< std::endl;
|
||||
@ -261,6 +392,23 @@ int main (int argc, char ** argv)
|
||||
Tester<ComplexF,vComplexF>(funcInnerProduct());
|
||||
ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexF permutes "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
|
||||
PermTester<ComplexF,vComplexF>(funcPermute(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
for(int r=0;r<vComplexF::Nsimd();r++){
|
||||
PermTester<ComplexF,vComplexF>(funcRotate(r));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexD "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
@ -276,5 +424,23 @@ int main (int argc, char ** argv)
|
||||
Tester<ComplexD,vComplexD>(funcInnerProduct());
|
||||
ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexD permutes "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
|
||||
PermTester<ComplexD,vComplexD>(funcPermute(i));
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
for(int r=0;r<vComplexD::Nsimd();r++){
|
||||
PermTester<ComplexD,vComplexD>(funcRotate(r));
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|