mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Big updates with progress towards wilson matrix

Peter Boyle
2015-04-26 15:51:09 +01:00
parent c678f2d255
commit 35cfef2129
27 changed files with 1008 additions and 355 deletions

View File

@ -11,6 +11,7 @@
#define GRID_H
#include <stdio.h>
#include <complex>
#include <vector>
#include <iostream>

View File

@ -2,7 +2,7 @@
/* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
#define AVX1 1
/* #undef AVX1 */
/* AVX2 */
/* #undef AVX2 */
@ -77,7 +77,7 @@
#define PACKAGE_VERSION "1.0"
/* SSE4 */
/* #undef SSE4 */
#define SSE4 1
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

View File

@ -26,10 +26,15 @@ namespace Grid {
typedef float RealF;
typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
#else
typedef RealF Real;
#endif
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
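Aside: the precision switch above means code written against Real/Complex compiles in either precision. A minimal standalone sketch of the mechanism (defining the macro by hand here is an assumption; normally configure sets it):

#include <complex>
#define GRID_DEFAULT_PRECISION_DOUBLE   // assumption: normally set by configure
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef double Real;                    // RealD in Grid
#else
typedef float  Real;                    // RealF in Grid
#endif
typedef std::complex<Real> Complex;
int main(void){ Complex z(Real(0.5),Real(0)); (void)z; return 0; }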
@ -63,8 +68,8 @@ namespace Grid {
//conj already supported for complex
inline ComplexF timesI(const ComplexF r) { return(r*ComplexF(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesI(const ComplexD r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD r){ return(r*ComplexD(0.0,-1.0));}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
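As a sanity check of the identities these helpers encode (a standalone sketch, independent of Grid): i*(a+bi) = -b+ai and -i*(a+bi) = b-ai, which the SIMD versions later in this commit must reproduce lane by lane.

#include <cassert>
#include <complex>
typedef std::complex<double> ComplexD;
inline ComplexD timesI(const ComplexD r)     { return r*ComplexD(0.0, 1.0); }
inline ComplexD timesMinusI(const ComplexD r){ return r*ComplexD(0.0,-1.0); }
int main(void){
  ComplexD z(1.0,2.0);
  assert( timesI(z)      == ComplexD(-2.0, 1.0) );  //  i*(1+2i) = -2+i
  assert( timesMinusI(z) == ComplexD( 2.0,-1.0) );  // -i*(1+2i) =  2-i
  return 0;
}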
@ -280,15 +285,11 @@ namespace Grid {
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
typedef vRealD vReal;
typedef vComplexD vComplex;
typedef std::complex<Real> Complex;
#else
typedef RealF Real;
typedef vRealF vReal;
typedef vComplexF vComplex;
typedef std::complex<Real> Complex;
#endif
}
#endif

View File

@ -47,6 +47,101 @@ namespace Grid {
int from_rank;
} ;
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void
Gather_plane_simple (Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
buffer[bo++]=compress(rhs._odata[so+o+b]);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=compress(rhs._odata[so+o+b]);
bo++;
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
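The compress argument only has to be a callable that maps a vobj to a cobj, plus the Point hook used by HaloExchange below. A hypothetical identity compressor (not part of this commit) would satisfy the interface like so:

// Hypothetical no-op compressor: cobj == vobj, nothing is actually compressed.
template<class vobj> class SimpleCompressor {
public:
  void Point(int point) {};                          // direction hint; unused here
  vobj operator () (const vobj &arg) { return arg; } // identity mapping
};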
///////////////////////////////////////////////////////////////////
// Gather for when there *is* a need to SIMD split, with compression
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> void
Gather_plane_extract(Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
cobj temp;
temp=compress(rhs._odata[so+o+b]);
extract(temp,pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b]);
extract(temp,pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
@ -86,8 +181,8 @@ namespace Grid {
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj> void HaloExchange(Lattice<vobj> &source,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
template<class vobj,class cobj, class compressor> void
HaloExchange(Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
@ -95,12 +190,10 @@ namespace Grid {
int u_comm_offset=0;
// Gather all comms buffers
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
for(int point = 0 ; point < _npoints; point++) {
printf("Point %d \n",point);fflush(stdout);
compress.Point(point);
int dimension = _directions[point];
int displacement = _distances[point];
@ -126,33 +219,30 @@ namespace Grid {
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
printf("splice 0x3 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
} else {
printf("NO splice 0x3 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
}
} else {
if(splice_dim){
printf("splice 0x1,2 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
} else {
printf("NO splice 0x1,2 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
}
}
}
}
}
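For orientation, the cbmask values threaded through these calls select checkerboards: 0x1 gathers one parity, 0x2 the other, 0x3 both in a single pass, via the ocb & cbmask test in the plane gathers above. A standalone sketch of the predicate (CbRed=0/CbBlack=1 as in Grid_qcd.h below):

#include <cassert>
int main(void){
  int CbRed=0, CbBlack=1;
  assert(  ((1<<CbRed)&0x3) &&  ((1<<CbBlack)&0x3) ); // 0x3: both parities
  assert(  ((1<<CbRed)&0x1) && !((1<<CbBlack)&0x1) ); // 0x1: red sites only
  assert( !((1<<CbRed)&0x2) &&  ((1<<CbBlack)&0x2) ); // 0x2: black sites only
  return 0;
}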
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
template<class vobj,class cobj, class compressor>
void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
GridBase *grid=_grid;
assert(rhs._grid==_grid);
@ -169,31 +259,26 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (offnode) {
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
int bytes = words * sizeof(cobj);
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
int rank = _grid->_processor;
int recv_from_rank;
@ -219,14 +304,14 @@ namespace Grid {
}
template<class vobj>
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -245,7 +330,7 @@ namespace Grid {
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
int words = sizeof(cobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION */
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
@ -285,7 +370,7 @@ namespace Grid {
for(int i=0;i<Nsimd;i++){
pointers[Nsimd-1-i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
for(int i=0;i<Nsimd;i++){

View File

@ -18,6 +18,7 @@ libGrid_a_SOURCES =\
Grid_init.cc\
stencil/Grid_stencil_common.cc\
qcd/Grid_qcd_dirac.cc\
qcd/Grid_qcd_wilson_dop.cc\
$(extra_sources)
#

View File

@ -79,18 +79,18 @@ namespace Grid {
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
//NB mult performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
//NB mult performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
@ -107,9 +107,9 @@ namespace Grid {
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs+rhs._odata[ss];
@ -117,9 +117,9 @@ namespace Grid {
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs-rhs._odata[ss];
@ -137,9 +137,9 @@ namespace Grid {
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]+rhs;
@ -147,9 +147,9 @@ namespace Grid {
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]-rhs;
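The corrections in this file all follow one rule: deduce the result type from the operation actually performed, since decltype(a*b) and decltype(a+b) need not agree for mixed operands. A minimal sketch of the idiom (toy types, not Grid's):

#include <type_traits>
struct A {}; struct B {};
B operator+(A,A);                 // toy: addition yields B
A operator*(A,A);                 // toy: multiplication yields A
template<class L,class R>
auto addAny(const L &l,const R &r) -> decltype(l+r) { return l+r; }
static_assert(std::is_same<decltype(addAny(A{},A{})),B>::value,
              "return type follows the + operator, not *");
int main(void){ return 0; }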

View File

@ -14,7 +14,7 @@ namespace Grid {
typedef typename vobj::scalar_type scalar;
typedef typename vobj::vector_type vector;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm;
scalar nrm;
//FIXME make this loop parallelisable
vnrm=zero;
@ -33,10 +33,11 @@ namespace Grid {
//->decltype(innerProduct(left._odata[0],right._odata[0]))
{
typedef typename vobj::scalar_type scalar;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm;
scalar nrm;
//FIXME make this loop parallelisable
vnrm=zero;
for(int ss=0;ss<left._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
@ -94,8 +95,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
sobj szero; szero=zero;
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,sobj(zero)); // sum across these down to scalars
std::vector<sobj> lsSum(ld,szero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file

View File

@ -26,6 +26,8 @@ namespace Grid {
}
};
class GridRNGbase {
public:
@ -62,6 +64,21 @@ namespace Grid {
}
// real scalars are one component
template<class scalar,class distribution> void fillScalar(scalar &s,distribution &dist)
{
s=dist(_generators[0]);
}
template<class distribution> void fillScalar(ComplexF &s,distribution &dist)
{
s=ComplexF(dist(_generators[0]),dist(_generators[0]));
}
template<class distribution> void fillScalar(ComplexD &s,distribution &dist)
{
s=ComplexD(dist(_generators[0]),dist(_generators[0]));
}
template <class sobj,class distribution> inline void fill(sobj &l,distribution &dist){
typedef typename sobj::scalar_type scalar_type;
@ -71,13 +88,60 @@ namespace Grid {
scalar_type *buf = (scalar_type *) & l;
for(int idx=0;idx<words;idx++){
buf[idx] = dist(_generators[0]);
fillScalar(buf[idx],dist);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
};
template <class distribution> inline void fill(ComplexF &l,distribution &dist){
fillScalar(l,dist);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,distribution &dist){
fillScalar(l,dist);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,distribution &dist){
fillScalar(l,dist);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,distribution &dist){
fillScalar(l,dist);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,distribution &dist){
RealF *pointer=(RealF *)&l;
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,distribution &dist){
RealD *pointer=(RealD *)&l;
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,distribution &dist){
RealF *pointer=(RealF *)&l;
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,distribution &dist){
RealD *pointer=(RealD *)&l;
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedRandomDevice(void){
std::random_device rd;
Seed(rd);
@ -186,7 +250,6 @@ namespace Grid {
};
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._uniform);
}
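The new overloads above all funnel into the same pattern: reinterpret the object as 2*Nsimd reals (for complex vectors) or Nsimd reals, draw each from the distribution, then broadcast from rank 0 so every node agrees. A standalone analogue of the vComplexF case, with Nsimd assumed to be 4 and the communicator omitted:

#include <random>
#include <complex>
int main(void){
  std::mt19937 gen(5);
  std::uniform_real_distribution<float> dist(0.0f,1.0f);
  const int Nsimd = 4;                             // assumption: 4 complex lanes
  std::complex<float> l[Nsimd];
  float *pointer = (float *)l;
  for(int i=0;i<2*Nsimd;i++) pointer[i]=dist(gen); // real+imag per lane
  return 0;
}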

View File

@ -11,21 +11,21 @@ namespace Grid {
// multiplication by fundamental scalar type
template<class l,int N> inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
return lhs*srhs;
}
template<class l,int N> inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iVector<l,N>::tensor_reduced srhs(rhs);
typename iVector<l,N>::tensor_reduced srhs; srhs=rhs;
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (const typename iScalar<l>::scalar_type lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type &rhs)
{
typename iMatrix<l,N>::tensor_reduced srhs(rhs);
typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l>::scalar_type & lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
@ -35,24 +35,24 @@ template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
@ -62,24 +62,26 @@ template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
@ -89,24 +91,24 @@ template<class l,int N> inline iMatrix<l,N> operator * (ComplexD lhs,const iMatr
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs; srhs=t;
return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
@ -118,14 +120,14 @@ template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatri
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l,int N> inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
return lhs+srhs;
}
template<class l,int N> inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs+lhs; }
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iMatrix<l,N>::tensor_reduced srhs(rhs);
typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
@ -135,16 +137,16 @@ template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs; srhs=t;
return lhs+srhs;
}
template<class l> inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) { return rhs+lhs; }
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
@ -155,8 +157,8 @@ template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs; srhs=t;
return lhs+srhs;
}
@ -164,8 +166,8 @@ template<class l> inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rh
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
@ -176,23 +178,23 @@ template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatri
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l,int N> inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
return lhs-srhs;
}
template<class l,int N> inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::tensor_reduced slhs(lhs);
typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::tensor_reduced slhs(lhs);
typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
return slhs-rhs;
}
@ -201,27 +203,27 @@ template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs; srhs=t;
return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
typename iScalar<l>::tensor_reduced slhs;slhs=t;
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
typename iScalar<l>::tensor_reduced slhs;slhs=t;
return slhs-rhs;
}
@ -230,26 +232,26 @@ template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t; t=rhs;
typename iScalar<l>::tensor_reduced srhs; srhs=t;
return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
typename iScalar<l>::scalar_type t;t=lhs;
typename iScalar<l>::tensor_reduced slhs;slhs=t;
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
typename iScalar<l>::scalar_type t;t=rhs;
typename iScalar<l>::tensor_reduced srhs;srhs=t;
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
typename iScalar<l>::scalar_type t;t=lhs;
typename iScalar<l>::tensor_reduced slhs;slhs=t;
return slhs-rhs;
}

View File

@ -17,7 +17,8 @@ namespace Grid {
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret=zero;
iScalar<ret_t> ret;
ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
@ -27,8 +28,9 @@ namespace Grid {
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret=zero;
iScalar<ret_t> ret;
iScalar<ret_t> tmp;
ret=zero;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);

View File

@ -8,6 +8,16 @@ namespace Grid {
// These can be composed to form tensor products of internal indices.
///////////////////////////////////////////////////
// It is useful to NOT have any constructors
// so that these classes assert "is_pod<class> == true"
// because then the standard C++ valarray container eliminates fill overhead on new allocation and
// non-move copying.
//
// However note that doing this eliminates some syntactical sugar such as
// calling the constructor explicitly or implicitly
//
#define TENSOR_IS_POD
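A minimal sketch of the trade-off described above, with a hypothetical stand-in type: dropping all constructors keeps the tensor a POD (so valarray/vector allocation does no per-element fill), at the cost of replacing construction with declare-then-assign, which is why the srhs(rhs) initialisers elsewhere in this commit become srhs; srhs=rhs.

#include <vector>
#include <type_traits>
struct PodTensor { double _internal; };     // no ctors, like iScalar under TENSOR_IS_POD
static_assert(std::is_pod<PodTensor>::value, "constructor-free => POD");
int main(void){
  PodTensor szero; szero._internal = 0.0;   // declare-then-assign replaces PodTensor(0)
  std::vector<PodTensor> lsSum(8, szero);   // cheap fill from a pre-zeroed value
  (void)lsSum;
  return 0;
}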
template<class vtype> class iScalar
{
public:
@ -25,16 +35,22 @@ public:
// Scalar no action
// template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
iScalar(){};
iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
iScalar(const Zero &z){ *this = zero; };
#ifndef TENSOR_IS_POD
iScalar(){;};
iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
iScalar(const Zero &z){ *this = zero; };
#endif
iScalar<vtype> & operator= (const Zero &hero){
zeroit(*this);
return *this;
zeroit(*this);
return *this;
}
iScalar<vtype> & operator= (const scalar_type s){
_internal=s;
return *this;
}
friend void zeroit(iScalar<vtype> &that){
zeroit(that._internal);
}
@ -114,8 +130,10 @@ public:
enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
iVector(const Zero &z){ *this = zero; };
iVector() {};// Empty constructor
#ifndef TENSOR_IS_POD
iVector(const Zero &z){ *this = zero; };
iVector() {};// Empty constructor
#endif
iVector<vtype,N> & operator= (const Zero &hero){
zeroit(*this);
@ -185,8 +203,11 @@ public:
enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
#ifndef TENSOR_IS_POD
iMatrix(const Zero &z){ *this = zero; };
iMatrix() {};
#endif
iMatrix<vtype,N> & operator= (const Zero &hero){
zeroit(*this);
return *this;

View File

@ -8,6 +8,7 @@ namespace QCD {
static const int Ns=4;
static const int Nd=4;
static const int Nhs=2; // half spinor
static const int Nds=8; // double stored gauge field
static const int CbRed =0;
static const int CbBlack=1;
@ -28,79 +29,216 @@ namespace QCD {
//
// That probably makes for GridRedBlack4dCartesian grid.
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
// s,sp,c,spc,lc
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF;
typedef iSpinMatrix<ComplexD > SpinMatrixD;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
// Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix;
typedef iColourMatrix<ComplexF > ColourMatrixF;
typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iColourMatrix<Complex > ColourMatrix;
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iSpinVector<Complex > SpinVector;
typedef iColourVector<Complex > ColourVector;
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
// Spin vector
typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF;
typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD;
// Colour vector
typedef iColourVector<Complex > ColourVector;
typedef iColourVector<ComplexF> ColourVectorF;
typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD;
// SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinColourVector<ComplexF> SpinColourVectorF;
typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
// HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinVector<ComplexF> HalfSpinVectorF;
typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
// HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex; // what if we don't know the tensor structure
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vInteger > vTInteger;
typedef iSinglet<vRealF> vTRealF;
typedef iSinglet<vRealD> vTRealD;
typedef iSinglet<vInteger> vTInteger;
typedef iSinglet<Integer > TInteger;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
// Lattices of these
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF;
typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef Lattice<vHalfSpinColourVector> LatticeHalfFermion;
typedef Lattice<vSpinColourVector> LatticeFermion;
typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD;
typedef Lattice<vSpinColourMatrix> LatticePropagator;
typedef Lattice<vLorentzColourMatrix> LatticeGaugeField;
typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
//////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes
@ -157,11 +295,14 @@ namespace QCD {
return peekIndex<LorentzIndex>(rhs,i,j);
}
// FIXME transpose Colour, transpose Spin, traceColour traceSpin
} //namespace QCD
} // Grid
#include <qcd/Grid_qcd_dirac.h>
#include <qcd/Grid_qcd_2spinor.h>
//#include <qcd/Grid_qcd_pauli.h>
#include <qcd/Grid_qcd_wilson_dop.h>
#endif

View File

@ -17,8 +17,14 @@ namespace QCD {
GammaZ,
GammaT,
Gamma5,
// GammaXGamma5,
// GammaYGamma5,
MinusIdentity,
MinusGammaX,
MinusGammaY,
MinusGammaZ,
MinusGammaT,
MinusGamma5
// GammaXGamma5, // Rest are composite (willing to take hit for two calls sequentially)
// GammaYGamma5, // as they are less commonly used.
// GammaZGamma5,
// GammaTGamma5,
// SigmaXY,
@ -27,12 +33,6 @@ namespace QCD {
// SigmaXT,
// SigmaYT,
// SigmaZT,
MinusIdentity,
MinusGammaX,
MinusGammaY,
MinusGammaZ,
MinusGammaT,
MinusGamma5
// MinusGammaXGamma5, easiest to form by composition
// MinusGammaYGamma5, as performance is not critical for these
// MinusGammaZGamma5,
@ -54,7 +54,6 @@ namespace QCD {
};
/* Gx
* 0 0 0 i
* 0 0 i 0

View File

@ -1,157 +1,220 @@
#ifnfdef GRID_QCD_WILSON_DOP_H
#define GRID_QCD_WILSON_DOP_H
#include <Grid.h>
namespace Grid {
namespace QCD {
const std::vector<int> WilsonMatrix::directions ({0,1,2,3, 0, 1, 2, 3,0});
const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0});
// Should be in header?
static const int WilsonMatrix::Xp = 0;
static const int WilsonMatrix::Yp = 1;
static const int WilsonMatrix::Zp = 2;
static const int WilsonMatrix::Tp = 3;
static const int WilsonMatrix::Xm = 4;
static const int WilsonMatrix::Ym = 5;
static const int WilsonMatrix::Zm = 6;
static const int WilsonMatrix::Tm = 7;
static const int WilsonMatrix::X0 = 8;
static const int WilsonMatrix::npoint=9;
const int WilsonMatrix::Xp = 0;
const int WilsonMatrix::Yp = 1;
const int WilsonMatrix::Zp = 2;
const int WilsonMatrix::Tp = 3;
const int WilsonMatrix::Xm = 4;
const int WilsonMatrix::Ym = 5;
const int WilsonMatrix::Zm = 6;
const int WilsonMatrix::Tm = 7;
//const int WilsonMatrix::X0 = 8;
class WilsonCompressor {
public:
int mu;
void Point(int p) { mu=p;};
vHalfSpinColourVector operator () (vSpinColourVector &in)
{
vHalfSpinColourVector ret;
switch(mu) {
case WilsonMatrix::Xp:
spProjXp(ret,in);
break;
case WilsonMatrix::Yp:
spProjYp(ret,in);
break;
case WilsonMatrix::Zp:
spProjZp(ret,in);
break;
case WilsonMatrix::Tp:
spProjTp(ret,in);
break;
case WilsonMatrix::Xm:
spProjXm(ret,in);
break;
case WilsonMatrix::Ym:
spProjYm(ret,in);
break;
case WilsonMatrix::Zm:
spProjZm(ret,in);
break;
case WilsonMatrix::Tm:
spProjTm(ret,in);
break;
default:
assert(0);
break;
}
return ret;
}
};
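The point of compressing here is physical: each spin projector spProj* maps a 4-component spinor onto 2 independent components, so only half the spinor data crosses the network. A toy single-colour check of the Xp projection convention assumed for spProjXp (chi_0 = psi_0 + i psi_3, chi_1 = psi_1 + i psi_2; a sketch, not Grid's code):

#include <cassert>
#include <complex>
typedef std::complex<double> Cplx;
int main(void){
  Cplx psi[4] = { Cplx(1,0), Cplx(2,0), Cplx(3,0), Cplx(4,0) };
  Cplx I(0,1);
  Cplx chi[2] = { psi[0] + I*psi[3], psi[1] + I*psi[2] }; // spProjXp, per colour
  assert( sizeof(chi) == sizeof(psi)/2 );                 // half the data to send
  return 0;
}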
WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,int _mass)
: Stencil((&Umu._grid,npoint,0,directions,displacements),
WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass)
: Stencil(Umu._grid,npoint,0,directions,displacements),
mass(_mass),
Umu(_Umu)
Umu(_Umu._grid)
{
// Allocate the required comms buffer
grid = _Umu._grid;
comm_buf.resize(Stencil._unified_buffer_size);
DoubleStore(Umu,_Umu);
}
void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
{
LatticeColourMatrix U(grid);
for(int mu=0;mu<Nd;mu++){
U = peekIndex<LorentzIndex>(Umu,mu);
pokeIndex<LorentzIndex>(Uds,U,mu);
U = adj(Cshift(U,mu,-1));
pokeIndex<LorentzIndex>(Uds,U,mu+4);
}
}
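DoubleStore pre-applies the shift-and-adjoint so that every link the hopping term needs lives at the site being updated: entries 0..3 of Uds hold U_mu(x), entries 4..7 hold U_mu^dag(x-mu), which is why Nds=8. Schematically the hopping term then reads

  Dhop psi(x) = sum_mu [ Uds_mu(x) P^-_mu psi(x+mu) + Uds_{mu+4}(x) P^+_mu psi(x-mu) ]

with P^{+/-}_mu the spin projectors implemented by the spProj*/spRecon* pairs (sign conventions as assumed by those routines).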
void WilsonMatrix::multiply(const LatticeFermion &in, LatticeFermion &out)
{
Dhop(in,out);
return;
}
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
{
Stencil.HaloExchange(in,comm_buf);
// Stencil.HaloExchange(in,comm_buf);
for(int ss=0;ss<_grid->oSites();ss++){
for(int ss=0;ss<grid->oSites();ss++){
int offset,local;
vSpinColourVector result;
vHalfSpinColourVector UChi;
vHalfSpinColourVector chi;
vHalfSpinColourVector Uchi;
vHalfSpinColourVector *chi_p;
// Xp
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
if ( local ) {
Uchi = U[]*spProjXp(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result = ReconXp(Uchi);
chi_p = &comm_buf[offset];
if ( local ) {
spProjXp(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)());
spReconXp(result,Uchi);
#if 0
// Yp
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjYp(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconYp(Uchi);
spProjYp(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)());
accumReconYp(result,Uchi);
// Zp
offset = Stencil._offsets [Zp][ss];
local = Stencil._is_local[Zp][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjZp(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconZp(Uchi);
spProjZp(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)() );
accumReconZp(result,Uchi);
// Tp
offset = Stencil._offsets [Tp][ss];
local = Stencil._is_local[Tp][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjTp(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconTp(Uchi);
spProjTp(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)());
accumReconTp(result,Uchi);
// Xm
offset = Stencil._offsets [Xm][ss];
local = Stencil._is_local[Xm][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjXm(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconXm(Uchi);
spProjXm(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
accumReconXm(result,Uchi);
// Ym
offset = Stencil._offsets [Ym][ss];
local = Stencil._is_local[Ym][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjYm(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconYm(Uchi);
spProjYm(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Ym)),&(*chi_p)());
accumReconYm(result,Uchi);
// Zm
offset = Stencil._offsets [Zm][ss];
local = Stencil._is_local[Zm][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjZm(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconZm(Uchi);
spProjZm(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Zm)),&(*chi_p)());
accumReconZm(result,Uchi);
// Tm
offset = Stencil._offsets [Tm][ss];
local = Stencil._is_local[Tm][ss];
chi_p = &comm_buf[offset];
if ( local ) {
Uchi = U[]*spProjTm(in._odata[offset]);
} else {
Uchi = U[]*comm_buf._odata[offset]
}
result+= ReconTm(Uchi);
spProjTm(chi,in._odata[offset]);
chi_p = &chi;
}
mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)());
accumReconTm(result,Uchi);
#endif
out._odata[ss] = result;
}
}
void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
{
return;
}
void WilsonMatrix::MpcDag (const LatticeFermion &in, LatticeFermion &out)
{
return;
}
void WilsonMatrix::Mpc (const LatticeFermion &in, LatticeFermion &out)
{
return;
}
void WilsonMatrix::MpcDagMpc(const LatticeFermion &in, LatticeFermion &out)
{
return;
}
void WilsonMatrix::MDagM (const LatticeFermion &in, LatticeFermion &out)
{
return;
}
}}
#endif
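Assuming a built Grid library and a constructed grid, a minimal driver for the new class might look like the following sketch (grid and RNG construction elided; this is not a test from this commit):

#include <Grid.h>
using namespace Grid;
using namespace Grid::QCD;
int main(int argc,char **argv){
  Grid_init(&argc,&argv);
  // GridCartesian grid(...);                 // lattice + SIMD layout, elided
  // LatticeGaugeField Umu(&grid);            // gauge field to double-store
  // LatticeFermion src(&grid), result(&grid);
  // WilsonMatrix Dw(Umu,0.1);                // new ctor: gauge field + mass
  // Dw.multiply(src,result);                 // forwards to Dhop
  return 0;
}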

View File

@ -1,4 +1,4 @@
#ifnfdef GRID_QCD_WILSON_DOP_H
#ifndef GRID_QCD_WILSON_DOP_H
#define GRID_QCD_WILSON_DOP_H
#include <Grid.h>
@ -21,21 +21,23 @@ namespace Grid {
GridBase *grid;
// Copy of the gauge field
LatticeGaugeField Umu;
LatticeDoubledGaugeField Umu;
//Defines the stencil
CartesianStencil Stencil;
static const int npoint=9;
static const std::vector<int> directions ;
static const std::vector<int> displacements;
static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm;
// Comms buffer
std::vector<vSpinColourVector,alignedAllocator<vSpinColourVector> > comm_buf;
std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> > comm_buf;
// Constructor
WilsonMatrix(LatticeGaugeField &Umu,int mass);
WilsonMatrix(LatticeGaugeField &Umu,double mass);
// DoubleStore
void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
// override multiply
void multiply(const LatticeFermion &in, LatticeFermion &out);

View File

@ -251,13 +251,13 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
friend inline vComplexD conj(const vComplexD &in){
vComplexD ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
// __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
// ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
ret.v = _mm256_addsub_pd(ret.v,in.v);
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
zvec tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
ret.v =_mm256_shuffle_pd(tmp,tmp,0x5);
#endif
#ifdef SSE4
ret.v = _mm_addsub_pd(ret.v,in.v);
zvec tmp = _mm_addsub_pd(ret.v,_mm_shuffle_pd(in.v,in.v,0x1));
ret.v = _mm_shuffle_pd(tmp,tmp,0x1);
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);
@ -268,48 +268,41 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
return ret;
}
friend inline vComplexD timesI(const vComplexD &in){
friend inline vComplexD timesMinusI(const vComplexD &in){
vComplexD ret; vzero(ret);
vComplexD tmp;
#if defined (AVX1)|| defined (AVX2)
cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
/*
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
*/
ret.v =_mm256_shuffle_ps(tmp,tmp,0x5);
tmp.v =_mm256_addsub_pd(ret.v,in.v); // r,-i
ret.v =_mm256_shuffle_pd(tmp.v,tmp.v,0x5);
#endif
#ifdef SSE4
cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
ret.v =_mm_shuffle_ps(tmp,tmp,0x5);
tmp.v =_mm_addsub_pd(ret.v,in.v); // r,-i
ret.v =_mm_shuffle_pd(tmp.v,tmp.v,0x1);
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag
ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);// OK
ret.v = _mm512_mask_sub_pd(in.v,0xaaaa,ret.v,in.v); // real -imag
ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK
#endif
#ifdef QPX
assert(0);
#endif
return ret;
}
friend inline vComplexD timesMinusI(const vComplexD &in){
friend inline vComplexD timesI(const vComplexD &in){
vComplexD ret; vzero(ret);
vComplexD tmp;
#if defined (AVX1)|| defined (AVX2)
cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
ret.v =_mm256_addsub_ps(ret.v,tmp); // i,-r
tmp.v =_mm256_shuffle_pd(in.v,in.v,0x5);
ret.v =_mm256_addsub_pd(ret.v,tmp.v); // i,-r
#endif
#ifdef SSE4
cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
ret.v =_mm_addsub_ps(ret.v,tmp); // r,-i
tmp.v =_mm_shuffle_pd(in.v,in.v,0x1);
ret.v =_mm_addsub_pd(ret.v,tmp.v); // r,-i
#endif
#ifdef AVX512
cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);// OK
ret.v = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp); // real -imag
tmp.v = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK
ret.v = _mm512_mask_sub_pd(tmp.v,0xaaaa,ret.v,tmp.v); // real -imag
#endif
#ifdef QPX
assert(0);
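The fixed double-precision paths can be checked against a scalar model: _mm_addsub_pd(a,b) = (a0-b0, a1+b1) and the shuffles swap the (re,im) pair, so starting from ret=(0,0) one recovers exactly the timesI/timesMinusI identities noted near the top of this commit. A standalone sketch:

#include <cassert>
int main(void){
  double re = 3.0, im = 4.0;
  // timesMinusI: tmp = addsub((0,0),(re,im)) = (-re,im); swap -> (im,-re)
  double mi_re =  im, mi_im = -re;
  // timesI:      tmp = swap(re,im) = (im,re); addsub((0,0),tmp) = (-im,re)
  double i_re  = -im, i_im  =  re;
  assert( mi_re ==  4.0 && mi_im == -3.0 );  // -i*(3+4i) =  4-3i
  assert( i_re  == -4.0 && i_im  ==  3.0 );  //  i*(3+4i) = -4+3i
  return 0;
}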

View File

@ -214,10 +214,10 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
{
#ifdef SSE4
union {
__m128 v1; // SSE 4 x float vector
cvec v1; // SSE 4 x float vector
float f[4]; // scalar array of 4 floats
} u128;
u128.v1= _mm_add_ps(v, _mm_shuffle_ps(v, v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return ComplexF(u128.f[0], u128.f[1]);
#endif
#ifdef AVX1
@ -329,13 +329,15 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
friend inline vComplexF conj(const vComplexF &in){
vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
// cvec tmp;
// tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
// ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
ret.v = _mm256_addsub_ps(ret.v,in.v);
cvec tmp;
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef SSE4
ret.v = _mm_addsub_ps(ret.v,in.v);
cvec tmp;
tmp = _mm_addsub_ps(ret.v,_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
@ -345,15 +347,16 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
#endif
return ret;
}
friend inline vComplexF timesI(const vComplexF &in){
vComplexF ret; vzero(ret);
friend inline vComplexF timesMinusI(const vComplexF &in){
vComplexF ret;
vzero(ret);
#if defined (AVX1)|| defined (AVX2)
cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
ret.v = _mm256_shuffle_ps(tmp,tmp,0x5);
ret.v = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
#endif
#ifdef SSE4
cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
ret.v = _mm_shuffle_ps(tmp,tmp,0x5);
ret.v = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag
@ -364,14 +367,14 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
#endif
return ret;
}
friend inline vComplexF timesMinusI(const vComplexF &in){
friend inline vComplexF timesI(const vComplexF &in){
vComplexF ret; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
ret.v = _mm256_addsub_ps(ret.v,tmp); // i,-r
cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r
ret.v =_mm256_addsub_ps(ret.v,tmp); //i,-r
#endif
#ifdef SSE4
cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
cvec tmp =_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));
ret.v = _mm_addsub_ps(ret.v,tmp); // r,-i
#endif
#ifdef AVX512
@ -443,5 +446,8 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
inline vComplexF trace(const vComplexF &arg){
return arg;
}
}
#endif

View File

@ -146,7 +146,7 @@ namespace Grid {
ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a[0],a[1]);
ret.v = _mm_set_pd(a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
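The set fixes above come down to argument order: the _mm_set_* intrinsics list lanes most-significant first, so loading a[] in memory order requires reversed arguments. A standalone check (SSE2, x86 only):

#include <emmintrin.h>
#include <cassert>
int main(void){
  double a[2] = {10.0, 20.0};
  __m128d v = _mm_set_pd(a[1],a[0]);      // (high, low): a[1] is the upper lane
  double out[2];
  _mm_storeu_pd(out,v);
  assert(out[0]==a[0] && out[1]==a[1]);   // memory order preserved
  return 0;
}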
@ -186,6 +186,15 @@ namespace Grid {
friend inline RealD Reduce(const vRealD & in)
{
#if defined (SSE4)
// FIXME Hack
const RealD * ptr =(const RealD *) &in;
RealD ret = 0;
for(int i=0;i<vRealD::Nsimd();i++){
ret = ret+ptr[i];
}
return ret;
#endif
#if defined (AVX1) || defined(AVX2)
typedef union {
uint64_t l;

View File

@ -175,7 +175,7 @@ namespace Grid {
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
@ -220,6 +220,15 @@ friend inline void vstore(const vRealF &ret, float *a){
}
friend inline RealF Reduce(const vRealF & in)
{
#if defined (SSE4)
// FIXME Hack
const RealF * ptr = (const RealF *) &in;
RealF ret = 0;
for(int i=0;i<vRealF::Nsimd();i++){
ret = ret+ptr[i];
}
return ret;
#endif
#if defined (AVX1) || defined(AVX2)
__attribute__ ((aligned(32))) float c_[16];
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);

View File

@ -121,8 +121,9 @@ namespace Grid {
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
shift = (shift + fd) %fd;
assert(shift>=0);
assert(shift<fd);