Mirror of https://github.com/paboyle/Grid.git (synced 2024-12-24 11:55:27 +00:00)

commit b0485894b3 (parent 0b7d389258)

Shook the stencil out to the point where I think the Wilson dslash is correct.
Need to audit the code carefully, consolidate between the stencil and cshift code, and then benchmark and optimise.
@@ -101,12 +101,12 @@ namespace Grid {
     std::vector<Integer> vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation
     std::vector<Integer> vrhs(vInteger::Nsimd());
     vInteger ret;
-    extract(lhs,vlhs);
-    extract(rhs,vrhs);
+    extract<vInteger,Integer>(lhs,vlhs);
+    extract<vInteger,Integer>(rhs,vrhs);
     for(int s=0;s<vInteger::Nsimd();s++){
       vlhs[s] = sop(vlhs[s],vrhs[s]);
     }
-    merge(ret,vlhs);
+    merge<vInteger,Integer>(ret,vlhs);
     return ret;
   }
   inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
lib/Grid_extract.h — new file, 193 lines
@@ -0,0 +1,193 @@
#ifndef GRID_EXTRACT_H
#define GRID_EXTRACT_H
/////////////////////////////////////////////////////////////////
// Generic extract/merge/permute
/////////////////////////////////////////////////////////////////

namespace Grid{

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Extract/merge a fundamental vector type, to pointer array with offset
  ////////////////////////////////////////////////////////////////////////////////////////////////

  template<class vsimd,class scalar>
  inline void extract(typename std::enable_if<isGridTensor<vsimd>::notvalue, const vsimd >::type * y,
                      std::vector<scalar *> &extracted,int offset){
    // FIXME: bounce off memory is painful
    int Nextr=extracted.size();
    int Nsimd=vsimd::Nsimd();
    int s=Nsimd/Nextr;

    scalar *buf = (scalar *)y;
    for(int i=0;i<Nextr;i++){
      extracted[i][offset] = buf[i*s];
    }
  };

  ////////////////////////////////////////////////////////////////////////
  // Merge simd vector from array of scalars to pointer array with offset
  ////////////////////////////////////////////////////////////////////////
  template<class vsimd,class scalar>
  inline void merge(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type * y,
                    std::vector<scalar *> &extracted,int offset){
    int Nextr=extracted.size();
    int Nsimd=vsimd::Nsimd();
    int s=Nsimd/Nextr;

    scalar *buf =(scalar *) y;
    for(int i=0;i<Nextr;i++){
      for(int ii=0;ii<s;ii++){
        buf[i*s+ii]=extracted[i][offset];
      }
    }
  };

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Extract a fundamental vector type to scalar array
  ////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vsimd,class scalar>
  inline void extract(typename std::enable_if<isGridTensor<vsimd>::notvalue, const vsimd >::type &y,std::vector<scalar> &extracted){

    int Nextr=extracted.size();
    int Nsimd=vsimd::Nsimd();
    int s=Nsimd/Nextr;

    scalar *buf = (scalar *)&y;
    for(int i=0;i<Nextr;i++){
      for(int ii=0;ii<s;ii++){
        extracted[i]=buf[i*s+ii];
      }
    }
  };

  ////////////////////////////////////////////////////////////////////////
  // Merge simd vector from array of scalars
  ////////////////////////////////////////////////////////////////////////
  template<class vsimd,class scalar>
  inline void merge(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type &y,std::vector<scalar> &extracted){
    int Nextr=extracted.size();
    int Nsimd=vsimd::Nsimd();
    int s=Nsimd/Nextr;
    scalar *buf = (scalar *)&y;

    for(int i=0;i<Nextr;i++){
      for(int ii=0;ii<s;ii++){
        buf[i*s+ii]=extracted[i];
      }
    }
  };

  template<class vsimd,class scalar>
  inline void AmergeA(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type &y,std::vector<scalar> &extracted){
    int Nextr=extracted.size();
    int Nsimd=vsimd::Nsimd();
    int s=Nsimd/Nextr;

    scalar *buf = (scalar *)&y;
    for(int i=0;i<Nextr;i++){
      for(int ii=0;ii<s;ii++){
        buf[i*s+ii]=extracted[i];
      }
    }
  };

  ////////////////////////////////////////////////////////////////////////
  // Extract to contiguous array scalar object
  ////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  {
    typedef typename vobj::scalar_type scalar_type ;
    typedef typename vobj::vector_type vector_type ;

    const int Nsimd=vobj::vector_type::Nsimd();
    const int words=sizeof(vobj)/sizeof(vector_type);

    extracted.resize(Nsimd);

    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++)
      pointers[i] =(scalar_type *)& extracted[i];

    vector_type *vp = (vector_type *)&vec;
    for(int w=0;w<words;w++){
      extract<vector_type,scalar_type>(&vp[w],pointers,w);
    }
  }

  ////////////////////////////////////////////////////////////////////////
  // Extract to a bunch of scalar object pointers, with offset
  ////////////////////////////////////////////////////////////////////////
  template<class vobj> inline
  void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
  {
    typedef typename vobj::scalar_type scalar_type ;
    typedef typename vobj::vector_type vector_type ;

    const int words=sizeof(vobj)/sizeof(vector_type);
    const int Nsimd=vobj::vector_type::Nsimd();

    assert(extracted.size()==Nsimd);

    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++) {
      pointers[i] =(scalar_type *)& extracted[i][offset];
    }

    vector_type *vp = (vector_type *)&vec;
    for(int w=0;w<words;w++){
      extract<vector_type,scalar_type>(&vp[w],pointers,w);
    }
  }

  ////////////////////////////////////////////////////////////////////////
  // Merge a contiguous array of scalar objects
  ////////////////////////////////////////////////////////////////////////
  template<class vobj> inline
  void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  {
    typedef typename vobj::scalar_type scalar_type ;
    typedef typename vobj::vector_type vector_type ;

    const int Nsimd=vobj::vector_type::Nsimd();
    const int words=sizeof(vobj)/sizeof(vector_type);

    assert(extracted.size()==Nsimd);

    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++)
      pointers[i] =(scalar_type *)& extracted[i];

    vector_type *vp = (vector_type *)&vec;
    for(int w=0;w<words;w++){
      merge<vector_type,scalar_type>(&vp[w],pointers,w);
    }
  }

  ////////////////////////////////////////////////////////////////////////
  // Merge a bunch of different scalar object pointers, with offset
  ////////////////////////////////////////////////////////////////////////
  template<class vobj> inline
  void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
  {
    typedef typename vobj::scalar_type scalar_type ;
    typedef typename vobj::vector_type vector_type ;

    const int Nsimd=vobj::vector_type::Nsimd();
    const int words=sizeof(vobj)/sizeof(vector_type);

    assert(extracted.size()==Nsimd);

    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++)
      pointers[i] =(scalar_type *)& extracted[i][offset];

    vector_type *vp = (vector_type *)&vec;
    assert((void *)vp!=NULL);
    for(int w=0;w<words;w++){
      merge<vector_type,scalar_type>(&vp[w],pointers,w);
    }
  }
}
#endif
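As I read the new header, extract and merge are intended to be inverses: extract pulls one scalar value per SIMD lane out of a vector object, and merge packs them back. A minimal round-trip sketch of that usage follows; it assumes the Grid headers are in scope, that vRealD/RealD behave as the fundamental vector/scalar pair they are elsewhere in the library, and that vRealD passes the isGridTensor "notvalue" test the header relies on. The check routine itself is hypothetical and not part of the commit.

    // Hedged sketch only: round-trip one value per SIMD lane through merge/extract.
    #include <Grid.h>     // assumed top-level Grid header
    #include <vector>
    #include <cassert>

    static void check_extract_merge_roundtrip(void)
    {
      std::vector<Grid::RealD> lanes(Grid::vRealD::Nsimd());
      for(int s=0;s<Grid::vRealD::Nsimd();s++) lanes[s] = 1.0*s;   // distinct value per lane

      Grid::vRealD v;
      Grid::merge<Grid::vRealD,Grid::RealD>(v,lanes);              // scalars -> vector

      std::vector<Grid::RealD> check(Grid::vRealD::Nsimd());
      Grid::extract<Grid::vRealD,Grid::RealD>(v,check);            // vector  -> scalars

      for(int s=0;s<Grid::vRealD::Nsimd();s++) assert(check[s]==lanes[s]);
    }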
@@ -95,6 +95,7 @@ public:
 #include <lattice/Grid_lattice_reduction.h>
 #include <lattice/Grid_lattice_peekpoke.h>
 #include <lattice/Grid_lattice_reality.h>
+#include <Grid_extract.h>
 #include <lattice/Grid_lattice_coordinate.h>
 #include <lattice/Grid_lattice_rng.h>
 #include <lattice/Grid_lattice_transfer.h>
@@ -133,68 +133,6 @@ namespace Grid {
 #endif

-/////////////////////////////////////////////////////////////////
-// Generic extract/merge/permute
-/////////////////////////////////////////////////////////////////
-
-template<class vsimd,class scalar>
-inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
-  // FIXME: bounce off memory is painful
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
-
-  vstore(y,&buf[0]);
-  for(int i=0;i<Nextr;i++){
-    *extracted[i] = buf[i*s];
-    extracted[i]++;
-  }
-};
-template<class vsimd,class scalar>
-inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
-
-  vstore(y,&buf[0]);
-  for(int i=0;i<Nextr;i++){
-    extracted[i] = buf[i*s];
-  }
-};
-template<class vsimd,class scalar>
-inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  std::vector<scalar> buf(Nsimd);
-  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=*extracted[i];
-    }
-    extracted[i]++;
-  }
-  vset(y,&buf[0]);
-};
-template<class vsimd,class scalar>
-inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  std::vector<scalar> buf(Nsimd);
-  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
-    }
-  }
-  vset(y,&buf[0]);
-};
-
 //////////////////////////////////////////////////////////
 // Permute
 // Permute 0 every ABCDEFGH -> BA DC FE HG
@@ -48,100 +48,6 @@ namespace Grid {
   } ;

-  ///////////////////////////////////////////////////////////////////
-  // Gather for when there is no need to SIMD split with compression
-  ///////////////////////////////////////////////////////////////////
-  template<class vobj,class cobj,class compressor> void
-  Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
-  {
-    int rd = rhs._grid->_rdimensions[dimension];
-
-    if ( !rhs._grid->CheckerBoarded(dimension) ) {
-
-      int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-      int o  = 0;                                    // relative offset to base within plane
-      int bo = 0;                                    // offset in buffer
-
-      // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-          buffer[bo++]=compress(rhs._odata[so+o+b]);
-        }
-        o +=rhs._grid->_slice_stride[dimension];
-      }
-
-    } else {
-
-      int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-      int o  = 0;                                    // relative offset to base within plane
-      int bo = 0;                                    // offset in buffer
-
-#pragma omp parallel for collapse(2)
-      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-          int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-          if ( ocb &cbmask ) {
-            buffer[bo]=compress(rhs._odata[so+o+b]);
-            bo++;
-          }
-
-        }
-        o +=rhs._grid->_slice_stride[dimension];
-      }
-    }
-  }
-
-  ///////////////////////////////////////////////////////////////////
-  // Gather for when there *is* need to SIMD split with compression
-  ///////////////////////////////////////////////////////////////////
-  template<class cobj,class vobj,class compressor> void
-  Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
-  {
-    int rd = rhs._grid->_rdimensions[dimension];
-
-    if ( !rhs._grid->CheckerBoarded(dimension) ) {
-
-      int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-      int o  = 0;                                    // relative offset to base within plane
-      int bo = 0;                                    // offset in buffer
-
-      // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-          cobj temp;
-          temp=compress(rhs._odata[so+o+b]);
-          extract(temp,pointers);
-        }
-        o +=rhs._grid->_slice_stride[dimension];
-      }
-
-    } else {
-
-      int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-      int o  = 0;                                    // relative offset to base within plane
-      int bo = 0;                                    // offset in buffer
-
-#pragma omp parallel for collapse(2)
-      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-          int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-          if ( ocb & cbmask ) {
-            cobj temp;
-            temp =compress(rhs._odata[so+o+b]);
-            extract(temp,pointers);
-          }
-
-        }
-        o +=rhs._grid->_slice_stride[dimension];
-      }
-    }
-  }
-
   class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
   public:
@@ -184,6 +90,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   template<class vobj,class cobj, class compressor> void
   HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
   {
+    std::cout<< "HaloExchange comm_buf.size()="<< u_comm_buf.size()<<" unified_buffer_size"<< _unified_buffer_size<< std::endl;
     // conformable(source._grid,_grid);
     assert(source._grid==_grid);
     if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
@@ -234,6 +141,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
         }
       }
     }
+    std::cout<< "HaloExchange complete"<< std::endl;
   }

   template<class vobj,class cobj, class compressor>
@@ -318,6 +226,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

     typedef typename cobj::vector_type vector_type;
     typedef typename cobj::scalar_type scalar_type;
+    typedef typename cobj::scalar_object scalar_object;

     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -340,12 +249,12 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
     int words = sizeof(cobj)/sizeof(vector_type);

     /* FIXME ALTERNATE BUFFER DETERMINATION */
-    std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
-    std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
-    int bytes = buffer_size*words*sizeof(scalar_type);
+    std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+    std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+    int bytes = buffer_size*sizeof(scalar_object);

-    std::vector<scalar_type *> pointers(Nsimd);   //
-    std::vector<scalar_type *> rpointers(Nsimd);  // received pointers
+    std::vector<scalar_object *> pointers(Nsimd);   //
+    std::vector<scalar_object *> rpointers(Nsimd);  // received pointers

     ///////////////////////////////////////////
     // Work out what to send where
@@ -353,62 +262,77 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

     int cb    = (cbmask==0x2)? 1 : 0;
     int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);

     // loop over outer coord planes orthog to dim
     for(int x=0;x<rd;x++){

-      // FIXME call local permute copy if none are offnode.
-      for(int i=0;i<Nsimd;i++){
-        pointers[i] = (scalar_type *)&send_buf_extract[i][0];
-      }
-      int sx = (x+sshift)%rd;
-
-      std::cout<< "Gathering "<< x <<std::endl;
-      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
-      std::cout<< "Gathered "<<std::endl;
-      for(int i=0;i<Nsimd;i++){
-        int inner_bit = (Nsimd>>(permute_type+1));
-        int ic= (i&inner_bit)? 1:0;
-
-        int my_coor  = rd*ic + x;
-        int nbr_coor = my_coor+sshift;
-        int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
-
-        int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
-        int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
-        int nbr_lane = (i&(~inner_bit));
-
-        int recv_from_rank;
-        int xmit_to_rank;
-
-        if (nbr_ic) nbr_lane|=inner_bit;
-        assert (sx == nbr_ox);
-
-        if(nbr_proc){
-
-          std::cout<< "MPI sending "<<std::endl;
-          _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-
-          _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
-                                xmit_to_rank,
-                                (void *)&recv_buf_extract[i][0],
-                                recv_from_rank,
-                                bytes);
-          std::cout<< "MPI complete "<<std::endl;
-
-          rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
-        } else {
-          rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
-        }
-      }
-
-      // Here we don't want to scatter, just place into a buffer.
-      std::cout<< "merging "<<std::endl;
-      for(int i=0;i<buffer_size;i++){
-        merge(u_comm_buf[u_comm_offset+i],rpointers);
-      }
+      int any_offnode = ( ((x+sshift)%fd) >= rd );
+      std::cout<<"any_offnode ="<<any_offnode<<std::endl;
+      if ( any_offnode ) {
+        // FIXME call local permute copy if none are offnode.
+        for(int i=0;i<Nsimd;i++){
+          pointers[i] = &send_buf_extract[i][0];
+        }
+        int sx = (x+sshift)%rd;
+
+        std::cout<< "Gathering "<< x <<std::endl;
+        Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
+        std::cout<< "Gathered "<<std::endl;
+        for(int i=0;i<Nsimd;i++){
+
+          std::vector<int> icoor;
+          _grid->iCoorFromIindex(icoor,i);
+
+          int inner_bit = (Nsimd>>(permute_type+1));
+          int ic= (i&inner_bit)? 1:0;
+          assert(ic==icoor[dimension]);
+
+          int my_coor   = rd*ic + x;
+          int nbr_coor  = my_coor+sshift;
+          int nbr_proc  = ((nbr_coor)/ld) % pd;// relative shift in processors
+          int nbr_lcoor = (nbr_coor%ld);
+          int nbr_ic    = (nbr_lcoor)/rd;      // inner coord of peer
+          int nbr_ox    = (nbr_lcoor%rd);      // outer coord of peer
+          int nbr_lane  = (i&(~inner_bit));
+
+          int recv_from_rank;
+          int xmit_to_rank;
+
+          if (nbr_ic) nbr_lane|=inner_bit;
+          assert (sx == nbr_ox);
+
+          std::cout<<"nbr_proc "<<nbr_proc<< " x "<<x<<" nbr_x "<<nbr_ox << " lane "<<i << " nbr_lane "<<nbr_lane
+                   << " nbr_ic "<<nbr_ic << " mycoor "<< my_coor<< " nbr_coor "<<nbr_coor<<std::endl;
+
+          if(nbr_proc){
+
+            std::cout<< "MPI sending "<<std::endl;
+            _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+            _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
+                                  xmit_to_rank,
+                                  (void *)&recv_buf_extract[i][0],
+                                  recv_from_rank,
+                                  bytes);
+            std::cout<< "MPI complete "<<std::endl;
+
+            rpointers[i] = &recv_buf_extract[i][0];
+            std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
+          } else {
+            rpointers[i] = &send_buf_extract[nbr_lane][0];
+            std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
+          }
+        }
+
+        // Here we don't want to scatter, just place into a buffer.
+        std::cout<< "merging u_comm_offset "<< u_comm_offset<<" comm_buf_size" << u_comm_buf.size() <<std::endl;
+
+        for(int i=0;i<buffer_size;i++){
+          assert(u_comm_offset+i<_unified_buffer_size);
+          merge(u_comm_buf[u_comm_offset+i],rpointers,i);
+        }
+        u_comm_offset+=buffer_size;
+      }
     }
   }
 };
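The neighbour bookkeeping in the gather loop above is easy to misread, so here is the same arithmetic pulled out as a standalone sketch. The formulas are copied from the hunk (inner_bit, ic, my_coor, nbr_coor, nbr_proc, nbr_lane); the helper struct and function names are hypothetical, rd/ld/pd stand for the reduced, local and processor extents in the shifted dimension, and any driver values you feed it are made up.

    // Hedged sketch: per-lane neighbour arithmetic from the HaloExchange gather.
    struct LaneNeighbour { int nbr_proc; int nbr_lane; int nbr_ox; };

    LaneNeighbour neighbour(int Nsimd,int permute_type,int rd,int ld,int pd,
                            int x,int sshift,int lane)
    {
      int inner_bit = (Nsimd>>(permute_type+1));
      int ic        = (lane&inner_bit) ? 1 : 0;     // which half of the SIMD lanes we sit in

      int my_coor   = rd*ic + x;                    // full local coordinate of this lane's plane
      int nbr_coor  = my_coor + sshift;             // coordinate after the shift
      int nbr_proc  = (nbr_coor/ld) % pd;           // relative shift in processors
      int nbr_lcoor = nbr_coor % ld;
      int nbr_ic    = nbr_lcoor / rd;               // inner coord of peer
      int nbr_ox    = nbr_lcoor % rd;               // outer coord of peer

      int nbr_lane  = lane & (~inner_bit);          // peer lane: same position, possibly other half
      if (nbr_ic) nbr_lane |= inner_bit;

      LaneNeighbour n = {nbr_proc,nbr_lane,nbr_ox};
      return n;
    }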
@@ -15,6 +15,7 @@ inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj
   conformable(iftrue,ret);

   GridBase *grid=iftrue._grid;
+  typedef typename vobj::scalar_object scalar_object;
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
   typedef typename iobj::vector_type mask_type;
@@ -23,27 +24,21 @@ inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj
   const int words = sizeof(vobj)/sizeof(vector_type);

   std::vector<Integer> mask(Nsimd);
-  std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
-  std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
-  std::vector<scalar_type *> pointers(Nsimd);
+  std::vector<scalar_object> truevals (Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);

#pragma omp parallel for
   for(int ss=0;ss<iftrue._grid->oSites(); ss++){

-    for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
-    extract(iftrue._odata[ss] ,pointers);
-    for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
-    extract(iffalse._odata[ss] ,pointers);
-
-    extract(TensorRemove(predicate._odata[ss]),mask);
+    extract(iftrue._odata[ss] ,truevals);
+    extract(iffalse._odata[ss] ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);

     for(int s=0;s<Nsimd;s++){
-      if (mask[s]) pointers[s]=&truevals[s][0];
-      else         pointers[s]=&falsevals[s][0];
+      if (mask[s]) falsevals[s]=truevals[s];
     }

-    merge(ret._odata[ss],pointers);
+    merge(ret._odata[ss],falsevals);
   }
 }
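The rewritten where() above is just a lane-wise select done through the new extract/merge helpers. A hedged scalar analogue of the per-site step, with hypothetical names and no Grid types, is:

    // Hedged sketch: keep the "true" value where the mask is set, otherwise the "false" one;
    // the returned vector is what merge() receives in the hunk above.
    #include <vector>
    #include <cstddef>

    template<class S>
    std::vector<S> select_lanes(const std::vector<int> &mask,
                                const std::vector<S>   &truevals,
                                std::vector<S>          falsevals)  // modified in place, as in where()
    {
      for(std::size_t s=0;s<mask.size();s++){
        if (mask[s]) falsevals[s]=truevals[s];
      }
      return falsevals;
    }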
@@ -2,176 +2,155 @@
 #define _GRID_CSHIFT_COMMON_H_

 namespace Grid {

+template<class vobj>
+class SimpleCompressor {
+public:
+  void Point(int) {};
+
+  vobj operator() (const vobj &arg) {
+    return arg;
+  }
+};
+
+///////////////////////////////////////////////////////////////////
+// Gather for when there is no need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+template<class vobj,class cobj,class compressor> void
+Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  int o  = 0;                                    // relative offset to base within plane
+  int bo = 0;                                    // offset in buffer
+
+#pragma omp parallel for collapse(2)
+  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+      if ( ocb &cbmask ) {
+        buffer[bo]=compress(rhs._odata[so+o+b]);
+        bo++;
+      }
+    }
+    o +=rhs._grid->_slice_stride[dimension];
+  }
+}
+
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+template<class cobj,class vobj,class compressor> void
+Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  int o  = 0;                                    // relative offset to base within plane
+  int bo = 0;                                    // offset in buffer
+
+#pragma omp parallel for collapse(2)
+  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+
+      int offset = b+n*rhs._grid->_slice_block[dimension];
+
+      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+      if ( ocb & cbmask ) {
+        cobj temp;
+        temp =compress(rhs._odata[so+o+b]);
+        extract<cobj>(temp,pointers,offset);
+      }
+
+    }
+    o +=rhs._grid->_slice_stride[dimension];
+  }
+}
+
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
 {
-  int rd = rhs._grid->_rdimensions[dimension];
-
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
-    // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-        buffer[bo++]=rhs._odata[so+o+b];
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-
-  } else {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-        if ( ocb &cbmask ) {
-          buffer[bo]=rhs._odata[so+o+b];
-          bo++;
-        }
-
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-  }
+  SimpleCompressor<vobj> dontcompress;
+  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
 }

 //////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj,class scalar_type> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 {
-  int rd = rhs._grid->_rdimensions[dimension];
-
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
-    // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-        extract(rhs._odata[so+o+b],pointers);
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-
-  } else {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-        if ( ocb & cbmask ) {
-          extract(rhs._odata[so+o+b],pointers);
-        }
-
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-  }
+  SimpleCompressor<vobj> dontcompress;
+  Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress);
 }

 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
 template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs._grid->_rdimensions[dimension];

   if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }

   int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
   int o  = 0;                                    // relative offset to base within plane
   int bo = 0;                                    // offset in buffer

-    // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-        rhs._odata[so+o+b]=buffer[bo++];
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-
-  } else {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
#pragma omp parallel for collapse(2)
   for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
     for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-        if ( ocb & cbmask ) {
-          rhs._odata[so+o+b]=buffer[bo++];
-        }
-
+      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+      if ( ocb & cbmask ) {
+        rhs._odata[so+o+b]=buffer[bo++];
       }
-      o +=rhs._grid->_slice_stride[dimension];
     }
+    o +=rhs._grid->_slice_stride[dimension];
   }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj,class scalar_type> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
+template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask)
 {
   int rd = rhs._grid->_rdimensions[dimension];

   if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }

   int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
   int o  = 0;                                    // relative offset to base within plane
   int bo = 0;                                    // offset in buffer

-    // Simple block stride gather of SIMD objects
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-        merge(rhs._odata[so+o+b],pointers);
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
-
-  } else {
-
-    int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                    // relative offset to base within plane
-    int bo = 0;                                    // offset in buffer
-
#pragma omp parallel for collapse(2)
   for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
     for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-        if ( ocb&cbmask ) {
-          merge(rhs._odata[so+o+b],pointers);
-        }
-
+      int offset = b+n*rhs._grid->_slice_block[dimension];
+      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+      if ( ocb&cbmask ) {
+        merge(rhs._odata[so+o+b],pointers,offset);
       }
-      o +=rhs._grid->_slice_stride[dimension];
     }
+    o +=rhs._grid->_slice_stride[dimension];
   }
 }
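The compressor hook added above is the point where the Wilson dslash will eventually shrink the gathered site data. SimpleCompressor just passes the object through; the uncompressed Gather_plane_* overloads forward to the compressor versions with it. A hedged sketch of a user-defined compressor with the same shape follows; the class is purely illustrative and not part of the commit, and a real dslash compressor would spin-project instead of passing through.

    // Hedged sketch: any type with Point(int) and operator()(const vobj&) can be
    // dropped into Gather_plane_simple / Gather_plane_extract as the compressor.
    template<class vobj>
    class PassThroughCompressor {      // hypothetical name, mirrors SimpleCompressor
    public:
      void Point(int) {};              // direction hook, unused here
      vobj operator() (const vobj &arg) {
        // A Wilson-dslash compressor would project 'arg' to a half spinor here
        // before it is written into the communication buffer.
        return arg;
      }
    };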
@@ -183,40 +162,26 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
   int rd = rhs._grid->_rdimensions[dimension];

   if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }

-    int o  = 0;                                     // relative offset to base within plane
-    int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
-
-    // Simple block stride gather of SIMD objects
+  int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
+  int o  = 0;                                     // relative offset to base within plane

#pragma omp parallel for collapse(2)
   for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
     for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

+      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
+      if ( ocb&cbmask ) {
         lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
       }
-      o +=rhs._grid->_slice_stride[dimension];
     }
+    o +=rhs._grid->_slice_stride[dimension];
-  } else {
-
-    int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                     // relative offset to base within plane
-
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-
-        if ( ocb&cbmask ) {
-          lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
-        }
-
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
   }
 }
@@ -224,42 +189,25 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
 {
   int rd = rhs._grid->_rdimensions[dimension];

   if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }

-  int o  = 0;                                     // relative offset to base within plane
-  int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-  int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
-
-  // Simple block stride gather of SIMD objects
+  int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
+  int o  = 0;                                     // relative offset to base within plane

#pragma omp parallel for collapse(2)
   for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
     for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

+      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
+      if ( ocb&cbmask ) {
         permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
       }
-      o +=rhs._grid->_slice_stride[dimension];
     }
+    o +=rhs._grid->_slice_stride[dimension];
-  } else {
-
-    int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-    int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
-    int o  = 0;                                     // relative offset to base within plane
-
-#pragma omp parallel for collapse(2)
-    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-
-        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-
-        if ( ocb&cbmask ) {
-          permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
-        }
-
-      }
-      o +=rhs._grid->_slice_stride[dimension];
-    }
   }
 }
@@ -133,6 +133,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
   GridBase *grid=rhs._grid;
   const int Nsimd = grid->Nsimd();
   typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object scalar_object;
   typedef typename vobj::scalar_type scalar_type;

   int fd = grid->_fdimensions[dimension];
@@ -155,12 +156,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
-  std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
-  int bytes = buffer_size*words*sizeof(scalar_type);
+  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+  int bytes = buffer_size*sizeof(scalar_object);

-  std::vector<scalar_type *> pointers(Nsimd);   //
-  std::vector<scalar_type *> rpointers(Nsimd);  // received pointers
+  std::vector<scalar_object *> pointers(Nsimd);   //
+  std::vector<scalar_object *> rpointers(Nsimd);  // received pointers

   ///////////////////////////////////////////
   // Work out what to send where
@@ -171,10 +172,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
   // loop over outer coord planes orthog to dim
   for(int x=0;x<rd;x++){

     // FIXME call local permute copy if none are offnode.

     for(int i=0;i<Nsimd;i++){
-      pointers[i] = (scalar_type *)&send_buf_extract[i][0];
+      pointers[i] = &send_buf_extract[i][0];
     }
     int sx = (x+sshift)%rd;
     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
@@ -208,9 +208,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
                           recv_from_rank,
                           bytes);

-      rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
+      rpointers[i] = &recv_buf_extract[i][0];
     } else {
-      rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
+      rpointers[i] = &send_buf_extract[nbr_lane][0];
     }

   }
@@ -5,21 +5,23 @@ namespace Grid {

   template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
   {
+    typedef typename iobj::scalar_object scalar_object;
     typedef typename iobj::scalar_type scalar_type;
     typedef typename iobj::vector_type vector_type;

     GridBase *grid = l._grid;
     int Nsimd = grid->iSites();

     std::vector<int> gcoor;
     std::vector<scalar_type> mergebuf(Nsimd);
-    std::vector<scalar_type *> mergeptr(Nsimd);
     vector_type vI;
     for(int o=0;o<grid->oSites();o++){
       for(int i=0;i<grid->iSites();i++){
         grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
-        mergebuf[i]=gcoor[mu];
-        mergeptr[i]=&mergebuf[i];
+        mergebuf[i]=(Integer)gcoor[mu];
       }
-      merge(vI,mergeptr);
+      AmergeA<vector_type,scalar_type>(vI,mergebuf);
       l._odata[o]=vI;
     }
   };
@@ -94,15 +94,12 @@ namespace Grid {
       grid->Broadcast(grid->BossRank(),s);

       std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);

       // extract-modify-merge cycle is easiest way and this is not perf critical
       if ( rank == grid->ThisRank() ) {
-        for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-        extract(l._odata[odx],pointers);
+        extract(l._odata[odx],buf);
         buf[idx] = s;
-        for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-        merge(l._odata[odx],pointers);
+        merge(l._odata[odx],buf);
       }

       return;
@@ -127,13 +124,12 @@ namespace Grid {

       int rank,odx,idx;
       grid->GlobalCoorToRankIndex(rank,odx,idx,site);
-      std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-
-      extract(l._odata[odx],pointers);
+      std::vector<sobj> buf(Nsimd);
+      extract(l._odata[odx],buf);

       s = buf[idx];

       grid->Broadcast(rank,s);

       return;
@@ -160,10 +156,8 @@ namespace Grid {
       odx= grid->oIndex(site);

       std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];

-      extract(l._odata[odx],pointers);
+      extract(l._odata[odx],buf);

       s = buf[idx];

@@ -188,16 +182,13 @@ namespace Grid {
       odx= grid->oIndex(site);

       std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];

       // extract-modify-merge cycle is easiest way and this is not perf critical
-      extract(l._odata[odx],pointers);
+      extract(l._odata[odx],buf);

       buf[idx] = s;

-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-      merge(l._odata[odx],pointers);
+      merge(l._odata[odx],buf);

       return;
     };
|
@ -66,9 +66,7 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<sobj> buf(Nsimd);
|
std::vector<sobj> buf(Nsimd);
|
||||||
std::vector<scalar_type *> pointers(Nsimd);
|
extract(vsum,buf);
|
||||||
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
|
|
||||||
extract(vsum,pointers);
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
|
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
|
||||||
|
|
||||||
|
@@ -26,8 +26,21 @@ namespace Grid {
     }

   };

+  // real scalars are one component
+  template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
+  {
+    s=dist(gen);
+  }
+  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+  {
+    s=ComplexF(dist(gen),dist(gen));
+  }
+  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+  {
+    s=ComplexD(dist(gen),dist(gen));
+  }
+
   class GridRNGbase {

   public:
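The reworked fillScalar overloads above take the random-number engine as an explicit argument instead of reading _generators[0] from inside the class. A hedged, self-contained sketch of that calling convention follows; it re-sketches the two overloads with a stand-in complex typedef so it compiles on its own, and the std::mt19937 engine and uniform distribution are illustrative choices, not what the RNG class necessarily uses.

    // Hedged sketch: drive the generator-passing fillScalar interface directly.
    #include <random>
    #include <complex>

    typedef std::complex<double> ComplexD;   // stand-in for Grid's ComplexD

    template<class scalar,class distribution,class generator>
    void fillScalar(scalar &s,distribution &dist,generator &gen){ s=dist(gen); }

    template<class distribution,class generator>
    void fillScalar(ComplexD &s,distribution &dist,generator &gen){
      s=ComplexD(dist(gen),dist(gen));       // two draws: real and imaginary part
    }

    int main(void){
      std::mt19937 gen(5);                                  // fixed seed for reproducibility
      std::uniform_real_distribution<double> dist(0.0,1.0);

      double   r; fillScalar(r,dist,gen);                   // one draw for a real scalar
      ComplexD z; fillScalar(z,dist,gen);                   // two draws for a complex scalar
      return 0;
    }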
@@ -64,20 +77,6 @@ namespace Grid {
     }

-    // real scalars are one component
-    template<class scalar,class distribution> void fillScalar(scalar &s,distribution &dist)
-    {
-      s=dist(_generators[0]);
-    }
-    template<class distribution> void fillScalar(ComplexF &s,distribution &dist)
-    {
-      s=ComplexF(dist(_generators[0]),dist(_generators[0]));
-    }
-    template<class distribution> void fillScalar(ComplexD &s,distribution &dist)
-    {
-      s=ComplexD(dist(_generators[0]),dist(_generators[0]));
-    }
-
     template <class sobj,class distribution> inline void fill(sobj &l,distribution &dist){
@@ -88,7 +87,7 @@ namespace Grid {
       scalar_type *buf = (scalar_type *) & l;

       for(int idx=0;idx<words;idx++){
-        fillScalar(buf[idx],dist);
+        fillScalar(buf[idx],dist,_generators[0]);
       }

       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
@@ -96,47 +95,47 @@ namespace Grid {
     };
 
     template <class distribution> inline void fill(ComplexF &l,distribution &dist){
-      fillScalar(l,dist);
+      fillScalar(l,dist,_generators[0]);
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(ComplexD &l,distribution &dist){
-      fillScalar(l,dist);
+      fillScalar(l,dist,_generators[0]);
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(RealF &l,distribution &dist){
-      fillScalar(l,dist);
+      fillScalar(l,dist,_generators[0]);
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(RealD &l,distribution &dist){
-      fillScalar(l,dist);
+      fillScalar(l,dist,_generators[0]);
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     // vector fill
     template <class distribution> inline void fill(vComplexF &l,distribution &dist){
       RealF *pointer=(RealF *)&l;
       for(int i=0;i<2*vComplexF::Nsimd();i++){
-        fillScalar(pointer[i],dist);
+        fillScalar(pointer[i],dist,_generators[0]);
       }
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(vComplexD &l,distribution &dist){
       RealD *pointer=(RealD *)&l;
       for(int i=0;i<2*vComplexD::Nsimd();i++){
-        fillScalar(pointer[i],dist);
+        fillScalar(pointer[i],dist,_generators[0]);
       }
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(vRealF &l,distribution &dist){
       RealF *pointer=(RealF *)&l;
       for(int i=0;i<vRealF::Nsimd();i++){
-        fillScalar(pointer[i],dist);
+        fillScalar(pointer[i],dist,_generators[0]);
       }
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
     template <class distribution> inline void fill(vRealD &l,distribution &dist){
       RealD *pointer=(RealD *)&l;
       for(int i=0;i<vRealD::Nsimd();i++){
-        fillScalar(pointer[i],dist);
+        fillScalar(pointer[i],dist,_generators[0]);
       }
       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
     }
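These vector overloads fill a SIMD register by viewing it as a flat array of real components, two per complex lane, all drawn from the serial generator. A small self-contained illustration of that reinterpretation, with a plain std::complex<float> array standing in for vComplexF (an assumption, not Grid's type):

    #include <complex>
    #include <iostream>
    #include <random>

    int main(){
      const int Nsimd = 4;                              // pretend SIMD width
      std::complex<float> v[Nsimd];                     // stands in for a vComplexF register
      std::mt19937 gen(7);
      std::uniform_real_distribution<float> dist(0.f,1.f);

      // 2*Nsimd reals overlay the complex lanes, exactly as the RealF* cast does above
      float *pointer = reinterpret_cast<float *>(v);
      for(int i=0;i<2*Nsimd;i++) pointer[i] = dist(gen);

      for(int i=0;i<Nsimd;i++) std::cout<<v[i]<<" ";
      std::cout<<std::endl;
      return 0;
    }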
@@ -187,18 +186,31 @@ namespace Grid {
     {
       std::vector<int> gcoor;
 
-      for(int gidx=0;gidx<_grid->_gsites;gidx++){
+      int gsites = _grid->_gsites;
+
+      typename source::result_type init = src();
+      std::ranlux48 pseeder(init);
+      std::uniform_int_distribution<uint64_t> ui;
+
+      for(int gidx=0;gidx<gsites;gidx++){
 
         int rank,o_idx,i_idx;
         _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
         _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
         int l_idx=generator_idx(o_idx,i_idx);
 
-        typename source::result_type init = src();
-        _grid->Broadcast(0,(void *)&init,sizeof(init));
+        std::vector<int> site_seeds(4);
+        for(int i=0;i<4;i++){
+          site_seeds[i]= ui(pseeder);
+        }
+
+        _grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size());
+
         if( rank == _grid->ThisRank() ){
-          _generators[l_idx] = std::ranlux48(init);
+          fixedSeed ssrc(site_seeds);
+          typename source::result_type sinit = ssrc();
+          _generators[l_idx] = std::ranlux48(sinit);
         }
       }
       _seeded=1;
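The reseeding now draws four integers per lattice site from a single master ranlux48 sequence, broadcasts them so every rank sees the same values, and seeds that site's generator from them. A minimal sketch of the same idea, with std::seed_seq playing the role of Grid's fixedSeed source and no MPI broadcast shown (both simplifications are assumptions):

    #include <cstdint>
    #include <iostream>
    #include <random>
    #include <vector>

    int main(){
      const int sites = 8;                               // pretend global volume
      std::ranlux48 pseeder(2015);                       // master seeder, one per job
      std::uniform_int_distribution<uint64_t> ui;

      std::vector<std::ranlux48> generators(sites);      // one generator per site
      for(int s=0;s<sites;s++){
        std::vector<int> site_seeds(4);                  // four integers per site
        for(int i=0;i<4;i++) site_seeds[i] = (int)ui(pseeder);
        // In Grid these integers would be broadcast so all ranks agree, then fed to a
        // fixedSeed source; std::seed_seq stands in for that step here.
        std::seed_seq sseq(site_seeds.begin(),site_seeds.end());
        generators[s].seed(sseq);
      }
      std::cout<<"site 0 first draw: "<<generators[0]()<<std::endl;
      return 0;
    }

Because every rank walks the same master sequence, the per-site seeds are identical everywhere and the random field does not depend on the processor decomposition.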
@@ -210,6 +222,7 @@ namespace Grid {
 
     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,distribution &dist){
 
+      typedef typename vobj::scalar_object scalar_object;
       typedef typename vobj::scalar_type scalar_type;
       typedef typename vobj::vector_type vector_type;
 
@@ -217,25 +230,22 @@ namespace Grid {
 
       int Nsimd =_grid->Nsimd();
       int osites=_grid->oSites();
+      int words=sizeof(scalar_object)/sizeof(scalar_type);
 
-      int words = sizeof(vobj)/sizeof(vector_type);
-      std::vector<std::vector<scalar_type> > buf(Nsimd,std::vector<scalar_type>(words));
-      std::vector<scalar_type *> pointers(Nsimd);
+      std::vector<scalar_object> buf(Nsimd);
 
       for(int ss=0;ss<osites;ss++){
         for(int si=0;si<Nsimd;si++){
           int gdx = generator_idx(ss,si); // index of generator state
-          pointers[si] = (scalar_type *)&buf[si][0];
+          scalar_type *pointer = (scalar_type *)&buf[si];
           for(int idx=0;idx<words;idx++){
-            pointers[si][idx] = dist(_generators[gdx]);
+            fillScalar(pointer[idx],dist,_generators[gdx]);
           }
         }
         // merge into SIMD lanes
-        merge(l._odata[ss],pointers);
+        merge(l._odata[ss],buf);
       }
     };
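The lattice fill now builds one scalar_object per SIMD lane from that lane's own generator and then merges the lane buffer into the vectorised site, rather than poking through per-lane pointers. A toy model of that fill-then-merge step, with plain float arrays standing in for Grid's scalar_object and vectorised site (assumptions for illustration only):

    #include <iostream>
    #include <random>
    #include <vector>

    int main(){
      const int Nsimd = 4;   // lanes per vector object
      const int words = 3;   // scalar words per scalar_object

      // one generator per lane, as generator_idx(ss,si) would select in Grid
      std::vector<std::mt19937> generators = { std::mt19937(1), std::mt19937(2),
                                               std::mt19937(3), std::mt19937(4) };
      std::uniform_real_distribution<float> dist(0.f,1.f);

      // fill a per-lane buffer of scalar objects
      std::vector<std::vector<float>> buf(Nsimd, std::vector<float>(words));
      for(int si=0;si<Nsimd;si++)
        for(int idx=0;idx<words;idx++)
          buf[si][idx] = dist(generators[si]);

      // "merge": interleave so word w of lane si lands at position w*Nsimd+si,
      // the layout a vectorised site stores internally
      std::vector<float> vsite(words*Nsimd);
      for(int w=0;w<words;w++)
        for(int si=0;si<Nsimd;si++)
          vsite[w*Nsimd+si] = buf[si][w];

      for(float x : vsite) std::cout<<x<<" ";
      std::cout<<std::endl;
      return 0;
    }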
@@ -57,12 +57,6 @@ public:
     friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
       permute(out._internal,in._internal,permutetype);
     }
-    friend void extract(const iScalar<vtype> &in,std::vector<scalar_type *> &out){
-      extract(in._internal,out); // extract advances the pointers in out
-    }
-    friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
-      merge(in._internal,out); // extract advances the pointers in out
-    }
 
     // Unary negation
     friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
@@ -149,16 +143,6 @@ public:
         permute(out._internal[i],in._internal[i],permutetype);
       }
     }
-    friend void extract(const iVector<vtype,N> &in,std::vector<scalar_type *> &out){
-      for(int i=0;i<N;i++){
-        extract(in._internal[i],out);// extract advances pointers in out
-      }
-    }
-    friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
-      for(int i=0;i<N;i++){
-        merge(in._internal[i],out);// extract advances pointers in out
-      }
-    }
     // Unary negation
     friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
       iVector<vtype,N> ret;
@@ -232,18 +216,6 @@ public:
         permute(out._internal[i][j],in._internal[i][j],permutetype);
       }}
     }
-    friend void extract(const iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
-      for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-        extract(in._internal[i][j],out);// extract advances pointers in out
-      }}
-    }
-    friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
-      for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-        merge(in._internal[i][j],out);// extract advances pointers in out
-      }}
-    }
     // Unary negation
     friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
       iMatrix<vtype,N> ret;
@@ -285,37 +257,6 @@ public:
 };
 
-
-template<class vobj> inline
-void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
-{
-  typedef typename vobj::scalar_type scalar_type ;
-  typedef typename vobj::vector_type vector_type ;
-
-  int Nsimd=vobj::vector_type::Nsimd();
-
-  extracted.resize(Nsimd);
-
-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++)
-    pointers[i] =(scalar_type *)& extracted[i];
-
-  extract(vec,pointers);
-}
-template<class vobj> inline
-void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
-{
-  typedef typename vobj::scalar_type scalar_type ;
-  typedef typename vobj::vector_type vector_type ;
-
-  int Nsimd=vobj::vector_type::Nsimd();
-  assert(extracted.size()==Nsimd);
-
-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++)
-    pointers[i] =(scalar_type *)& extracted[i];
-
-  merge(vec,pointers);
-}
-
 }
 #endif
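With the per-tensor friends and free templates deleted here, the extract/merge machinery is consolidated in one generic header, and call sites only ever see the round trip: split a vectorised object into per-lane scalar objects, work on a lane, pack it back. A schematic round-trip demo with placeholder types (ScalarObj, VectorObj, extractLanes, mergeLanes are illustrations, not Grid's API):

    #include <array>
    #include <iostream>
    #include <vector>

    // Placeholder "vector object": 4 SIMD lanes of a 3-word scalar object,
    // stored word-major, lane-minor.
    struct ScalarObj { std::array<double,3> w; };
    struct VectorObj { std::array<double,3*4> data; };

    constexpr int Nsimd = 4;
    constexpr int words = 3;

    void extractLanes(const VectorObj &v, std::vector<ScalarObj> &out){
      out.resize(Nsimd);
      for(int w=0;w<words;w++)
        for(int s=0;s<Nsimd;s++)
          out[s].w[w] = v.data[w*Nsimd+s];
    }
    void mergeLanes(VectorObj &v, const std::vector<ScalarObj> &in){
      for(int w=0;w<words;w++)
        for(int s=0;s<Nsimd;s++)
          v.data[w*Nsimd+s] = in[s].w[w];
    }

    int main(){
      VectorObj v;
      for(int i=0;i<words*Nsimd;i++) v.data[i] = i;

      std::vector<ScalarObj> lanes;
      extractLanes(v,lanes);          // split into per-lane scalar objects
      lanes[2].w[0] = -1.0;           // touch one lane independently
      mergeLanes(v,lanes);            // pack the lanes back

      std::cout<<v.data[0*Nsimd+2]<<std::endl;   // prints -1
      return 0;
    }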
@@ -64,6 +64,14 @@ namespace Grid {
     typedef ComplexD scalar_object;
     enum { TensorLevel = 0 };
   };
+  template<> class GridTypeMapper<Integer> {
+  public:
+    typedef Integer scalar_type;
+    typedef Integer vector_type;
+    typedef Integer tensor_reduced;
+    typedef Integer scalar_object;
+    enum { TensorLevel = 0 };
+  };
 
   template<> class GridTypeMapper<vRealF> {
   public:
@@ -99,10 +107,10 @@ namespace Grid {
   };
   template<> class GridTypeMapper<vInteger> {
   public:
     typedef Integer  scalar_type;
     typedef vInteger vector_type;
     typedef vInteger tensor_reduced;
     typedef Integer  scalar_object;
     enum { TensorLevel = 0 };
   };
 
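GridTypeMapper is the trait that lets templated code recover the scalar and vector types behind any tensor or fundamental type; the new Integer specialisation lets plain integers flow through the same generic machinery as reals and complexes. A cut-down illustration of the trait pattern, with a stand-in trait and types rather than Grid's headers (all names here are assumptions):

    #include <iostream>
    #include <typeinfo>

    // Stand-in trait in the style of GridTypeMapper.
    template<class T> struct TypeMapper;

    template<> struct TypeMapper<double> {
      typedef double scalar_type;
      enum { TensorLevel = 0 };
    };
    template<> struct TypeMapper<int> {        // analogous to the new Integer specialisation
      typedef int scalar_type;
      enum { TensorLevel = 0 };
    };

    template<class T>
    void report(const T &){
      typedef typename TypeMapper<T>::scalar_type scalar_type;
      std::cout<<"scalar_type is "<<typeid(scalar_type).name()
               <<", tensor level "<<TypeMapper<T>::TensorLevel<<std::endl;
    }

    int main(){
      report(3.14);   // picks the double specialisation
      report(42);     // picks the int specialisation
      return 0;
    }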
@@ -99,106 +99,159 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 
   for(int ss=0;ss<grid->oSites();ss++){
 
-    int offset,local;
+    int offset,local,perm, ptype;
 
     vSpinColourVector result;
     vHalfSpinColourVector chi;
+    vHalfSpinColourVector tmp;
     vHalfSpinColourVector Uchi;
     vHalfSpinColourVector *chi_p;
 
     result=zero;
 
-#if 0
     // Xp
     offset = Stencil._offsets [Xp][ss];
     local  = Stencil._is_local[Xp][ss];
+    perm   = Stencil._permute[Xp][ss];
+    ptype  = Stencil._permute_type[Xp];
     chi_p  = &comm_buf[offset];
     if ( local ) {
-      spProjXp(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjXp(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)());
     spReconXp(result,Uchi);
 
     // Yp
     offset = Stencil._offsets [Yp][ss];
     local  = Stencil._is_local[Yp][ss];
+    perm   = Stencil._permute[Yp][ss];
+    ptype  = Stencil._permute_type[Yp];
     chi_p  = &comm_buf[offset];
     if ( local ) {
-      spProjYp(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjYp(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)());
     accumReconYp(result,Uchi);
 
     // Zp
     offset = Stencil._offsets [Zp][ss];
     local  = Stencil._is_local[Zp][ss];
+    perm   = Stencil._permute[Zp][ss];
+    ptype  = Stencil._permute_type[Zp];
     chi_p  = &comm_buf[offset];
 
     if ( local ) {
-      spProjZp(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjZp(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
-    mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)() );
+    mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)());
     accumReconZp(result,Uchi);
 
     // Tp
     offset = Stencil._offsets [Tp][ss];
     local  = Stencil._is_local[Tp][ss];
+    perm   = Stencil._permute[Tp][ss];
+    ptype  = Stencil._permute_type[Tp];
     chi_p  = &comm_buf[offset];
 
     if ( local ) {
-      spProjTp(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjTp(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)());
     accumReconTp(result,Uchi);
-#endif
 
     // Xm
     offset = Stencil._offsets [Xm][ss];
     local  = Stencil._is_local[Xm][ss];
+    perm   = Stencil._permute[Xm][ss];
+    ptype  = Stencil._permute_type[Xm];
     chi_p  = &comm_buf[offset];
     if ( local ) {
-      spProjXm(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjXm(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
+    std::cout<<"Xm for site "<<ss<<" l "<<local<<" p "<<perm<<" chi "<<Reduce(TensorRemove(innerProduct(*chi_p,*chi_p)))<<std::endl;
     mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
     accumReconXm(result,Uchi);
-#if 0
 
     // Ym
     offset = Stencil._offsets [Ym][ss];
     local  = Stencil._is_local[Ym][ss];
+    perm   = Stencil._permute[Ym][ss];
+    ptype  = Stencil._permute_type[Ym];
     chi_p  = &comm_buf[offset];
 
     if ( local ) {
-      spProjYm(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjYm(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Ym)),&(*chi_p)());
     accumReconYm(result,Uchi);
 
     // Zm
     offset = Stencil._offsets [Zm][ss];
     local  = Stencil._is_local[Zm][ss];
+    perm   = Stencil._permute[Zm][ss];
+    ptype  = Stencil._permute_type[Zm];
     chi_p  = &comm_buf[offset];
 
     if ( local ) {
-      spProjZm(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjZm(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Zm)),&(*chi_p)());
     accumReconZm(result,Uchi);
 
     // Tm
     offset = Stencil._offsets [Tm][ss];
     local  = Stencil._is_local[Tm][ss];
+    perm   = Stencil._permute[Tm][ss];
+    ptype  = Stencil._permute_type[Tm];
     chi_p  = &comm_buf[offset];
 
     if ( local ) {
-      spProjTm(chi,in._odata[offset]);
       chi_p = &chi;
+      spProjTm(chi,in._odata[offset]);
+      if ( perm ) {
+        permute(tmp,chi,ptype);
+        chi_p = &tmp;
+      }
     }
     mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)());
     accumReconTm(result,Uchi);
-#endif
 
     out._odata[ss] = result;
   }
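For each of the eight directions the kernel now decides where the half-spinor comes from: a neighbour on this node (spin-project, then permute SIMD lanes if the shift wraps inside the vectorised layout) or the halo buffer filled by HaloExchange. A schematic, type-free sketch of just that gather decision; every name below is a placeholder, since Grid's spProj/permute/mult act on real spinor and gauge types:

    #include <iostream>
    #include <vector>

    // Placeholder half-spinor: just a tagged value so the control flow is visible.
    struct HalfSpinor { int from_site; bool permuted; };

    // Stand-ins for the per-direction stencil tables (assumptions, not the real API).
    struct StencilEntry { int offset; bool is_local; bool permute; int permute_type; };

    HalfSpinor spProj(int site){ return HalfSpinor{site,false}; }       // pretend spin projection
    HalfSpinor permuteLanes(const HalfSpinor &h,int /*ptype*/){         // pretend SIMD lane permute
      return HalfSpinor{h.from_site,true};
    }

    // The decision the Dhop kernel makes for one direction at one site.
    HalfSpinor gather(const StencilEntry &e,
                      const std::vector<HalfSpinor> &comm_buf){
      if(e.is_local){
        HalfSpinor chi = spProj(e.offset);                   // neighbour lives on this node
        if(e.permute) return permuteLanes(chi,e.permute_type); // wrapped inside the SIMD layout
        return chi;
      }
      return comm_buf[e.offset];                             // otherwise read the halo buffer
    }

    int main(){
      std::vector<HalfSpinor> comm_buf = { {100,false}, {101,false} };
      StencilEntry local_entry   {7, true,  true, 0};
      StencilEntry offnode_entry {1, false, false, 0};
      std::cout<<gather(local_entry,comm_buf).from_site<<" "
               <<gather(offnode_entry,comm_buf).from_site<<std::endl;   // 7 101
      return 0;
    }

The gauge-link multiply and spin reconstruction then act on whichever buffer the gather selected, which is why chi_p is carried as a pointer.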
@@ -163,6 +163,7 @@ namespace Grid {
     // all subtypes; may not be a good assumption, but could
     // add the vector width as a template param for BG/Q for example
     ////////////////////////////////////////////////////////////////////
+    /*
     friend inline void permute(vComplexD &y,vComplexD b,int perm)
     {
       Gpermute<vComplexD>(y,b,perm);
@@ -183,6 +184,7 @@ namespace Grid {
     {
       Gextract<vComplexD,ComplexD>(y,extracted);
     }
+    */
 
     ///////////////////////
     // Splat
@@ -412,6 +412,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
     {
       Gpermute<vComplexF>(y,b,perm);
     }
+    /*
     friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
     {
       Gmerge<vComplexF,ComplexF >(y,extracted);
@@ -428,7 +429,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
     {
       Gextract<vComplexF,ComplexF>(y,extracted);
     }
+    */
 
   };
 
@@ -221,6 +221,7 @@ namespace Grid {
     {
       Gpermute<vInteger>(y,b,perm);
     }
+    /*
     friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
     {
       Gmerge<vInteger,Integer>(y,extracted);
@@ -237,7 +238,7 @@ namespace Grid {
     {
       Gextract<vInteger,Integer>(y,extracted);
     }
+    */
 
   public:
     static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
@@ -105,6 +105,7 @@ namespace Grid {
     // all subtypes; may not be a good assumption, but could
     // add the vector width as a template param for BG/Q for example
     ////////////////////////////////////////////////////////////////////
+    /*
     friend inline void permute(vRealD &y,vRealD b,int perm)
     {
       Gpermute<vRealD>(y,b,perm);
@@ -125,7 +126,7 @@ namespace Grid {
     {
       Gextract<vRealD,RealD>(y,extracted);
     }
+    */
 
     friend inline void vsplat(vRealD &ret,double a){
 #if defined (AVX1)|| defined (AVX2)
@@ -127,6 +127,7 @@ namespace Grid {
     // all subtypes; may not be a good assumption, but could
     // add the vector width as a template param for BG/Q for example
     ////////////////////////////////////////////////////////////////////
+    /*
     friend inline void permute(vRealF &y,vRealF b,int perm)
     {
       Gpermute<vRealF>(y,b,perm);
@@ -147,7 +148,7 @@ namespace Grid {
     {
       Gextract<vRealF,RealF>(y,extracted);
     }
+    */
 
 
     /////////////////////////////////////////////////////
@@ -118,6 +118,7 @@ namespace Grid {
 
   int fd = _grid->_fdimensions[dimension];
   int rd = _grid->_rdimensions[dimension];
+  int pd = _grid->_processors[dimension];
   int simd_layout     = _grid->_simd_layout[dimension];
   int comm_dim        = _grid->_processors[dimension] >1 ;
 
@@ -136,10 +137,10 @@ namespace Grid {
 
   for(int x=0;x<rd;x++){
 
-    int offnode = ( x+sshift >= rd );
+    int comm_proc = ((x+sshift)/rd)%pd;
+    int offnode   = (comm_proc!=0);
     int sx        = (x+sshift)%rd;
-    int comm_proc = (x+sshift)/rd;
 
     if (!offnode) {
 
       int permute_slice=0;
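The shift now reduces the communicating processor modulo the processor extent in that dimension, so a displacement that wraps all the way around the torus folds back to rank 0 and is correctly treated as on-node again. A small numeric check of the old versus new formulas, with values chosen only for illustration:

    #include <iostream>

    int main(){
      const int rd = 4;        // reduced local extent in this dimension
      const int pd = 2;        // processors in this dimension
      const int sshift = 6;    // shift applied to this slice

      for(int x=0;x<rd;x++){
        int offnode_old   = ( x+sshift >= rd );       // old test: never wraps back on-node
        int comm_proc_old = (x+sshift)/rd;            // old value: can exceed pd-1
        int comm_proc     = ((x+sshift)/rd)%pd;       // new value: folded onto the torus
        int offnode       = (comm_proc!=0);
        int sx            = (x+sshift)%rd;
        std::cout<<"x="<<x<<" sx="<<sx
                 <<" old(comm_proc="<<comm_proc_old<<",offnode="<<offnode_old<<")"
                 <<" new(comm_proc="<<comm_proc<<",offnode="<<offnode<<")"<<std::endl;
      }
      return 0;
    }

For x=2 the old code reports comm_proc=2 and offnode=1, even though with pd=2 that data has wrapped back to this rank; the new code gives comm_proc=0, offnode=0.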
@@ -75,9 +75,9 @@ void Tester(const functor &func)
     random(sRNG,result[i]);
   }
 
-  Gmerge(v_input1,input1);
-  Gmerge(v_input2,input2);
-  Gmerge(v_result,result);
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+  merge<vec,scal>(v_result,result);
 
   func(v_result,v_input1,v_input2);
 
@@ -85,7 +85,7 @@ void Tester(const functor &func)
     func(reference[i],input1[i],input2[i]);
   }
 
-  Gextract(v_result,result);
+  extract<vec,scal>(v_result,result);
   std::cout << " " << func.name()<<std::endl;
 
   int ok=0;
@@ -4,21 +4,12 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-template<class vobj>
-class SimpleCompressor {
-public:
-  void Point(int) {};
-
-  vobj operator() (const vobj &arg) {
-    return arg;
-  }
-};
-
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
   std::vector<int> simd_layout({1,1,2,2});
-  std::vector<int> mpi_layout ({2,2,2,2});
+  std::vector<int> mpi_layout ({2,2,1,2});
   std::vector<int> latt_size  ({8,8,8,8});
 
   double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -26,7 +17,9 @@ int main (int argc, char ** argv)
   GridCartesian Fine(latt_size,simd_layout,mpi_layout);
   GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
   GridParallelRNG       fRNG(&Fine);
-  fRNG.SeedRandomDevice();
+  //  fRNG.SeedRandomDevice();
+  std::vector<int> seeds({1,2,3,4});
+  fRNG.SeedFixedIntegers(seeds);
 
   LatticeColourMatrix Foo(&Fine);
   LatticeColourMatrix Bar(&Fine);
@@ -38,8 +31,9 @@ int main (int argc, char ** argv)
 
 
   for(int dir=0;dir<4;dir++){
-    for(int disp=0;disp<Fine._rdimensions[dir];disp++){
+    for(int disp=0;disp<Fine._fdimensions[dir];disp++){
 
+      std::cout << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
       // start to test the Cartesian npoint stencil infrastructure
       int npoint=1;
       std::vector<int> directions(npoint,dir);
@@ -47,22 +41,13 @@ int main (int argc, char ** argv)
 
       CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
 
-      printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
       std::vector<int> ocoor(4);
       for(int o=0;o<Fine.oSites();o++){
         Fine.oCoorFromOindex(ocoor,o);
         ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
-        int nbr = Fine.oIndexReduced(ocoor);
-        int stcl= myStencil._offsets[0][o];
-        if(nbr!=stcl){
-          printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
-        }
       }
 
-      printf("allocating %d buffers\n",myStencil._unified_buffer_size);
-      fflush(stdout);
       std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
-      printf("calling halo exchange\n");fflush(stdout);
       SimpleCompressor<vColourMatrix> compress;
       myStencil.HaloExchange(Foo,comm_buf,compress);
 
@@ -81,14 +66,12 @@ int main (int argc, char ** argv)
           Check._odata[i] = Foo._odata[offset];
         else
           Check._odata[i] = comm_buf[offset];
 
       }
 
       Real nrmC = norm2(Check);
       Real nrmB = norm2(Bar);
       Real nrm  = norm2(Check-Bar);
-      printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout);
+      std::cout<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
 
       Real snrmC =0;
       Real snrmB =0;
@@ -110,10 +93,11 @@ int main (int argc, char ** argv)
             diff =check()()(r,c)-bar()()(r,c);
             double nn=real(conj(diff)*diff);
             if ( nn > 0){
-              printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
+              printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
                      coor[0],coor[1],coor[2],coor[3],r,c,
                      nn,
                      real(check()()(r,c)),
+                     imag(check()()(r,c)),
                      real(bar()()(r,c))
                      );
             }
@@ -124,7 +108,7 @@ int main (int argc, char ** argv)
 
   }}}}
 
-  printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout);
+  std::cout<<"scalar N2diff = "<<snrm<<" " <<snrmC<<" "<<snrmB<<std::endl;
 
 
 }
@@ -28,6 +28,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG  pRNG(&Grid);
+  //  std::vector<int> seeds({1,2,3,4});
   //  pRNG.SeedFixedIntegers(seeds);
   pRNG.SeedRandomDevice();
 
@@ -44,7 +45,7 @@ int main (int argc, char ** argv)
     U[mu] = peekIndex<3>(Umu,mu);
   }
 
-  std::vector<int> mask({0,0,0,0,1,0,0,0});
+  std::vector<int> mask({1,1,1,1,1,1,1,1});
   { // Naive wilson implementation
     ref = zero;
     for(int mu=0;mu<Nd;mu++){