1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Vectorise the XYZT face gathering better.

Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent
with efficiency.
This commit is contained in:
paboyle 2017-02-15 11:11:04 +00:00
parent aca7a3ef0a
commit bd600702cf
10 changed files with 510 additions and 34 deletions

View File

@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv)
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
} else {
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);

View File

@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
struct Merge {
cobj * mpointer;
std::vector<scalar_object *> rpointers;
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer packet_id;
Integer exchange;
Integer type;
};
std::vector<Merge> Mergers;
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
Merge m;
m.exchange = 0;
m.mpointer = merge_p;
m.rpointers= rpointers;
m.buffer_size = buffer_size;
@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
Mergers.push_back(m);
}
void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
Merge m;
m.exchange = 1;
m.type = type;
m.mpointer = merge_p;
m.vpointers= rpointers;
m.buffer_size = buffer_size;
m.packet_id = packet_id;
Mergers.push_back(m);
}
void CommsMerge(void ) {
for(int i=0;i<Mergers.size();i++){
@ -214,10 +229,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
// std::string fname(ss.str());
// std::ofstream fout(fname);
if ( Mergers[i].exchange == 0 ) {
PARALLEL_FOR_LOOP
for(int o=0;o<Mergers[i].buffer_size;o++){
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
for(int o=0;o<Mergers[i].buffer_size;o++){
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
}
} else {
PARALLEL_FOR_LOOP
for(int o=0;o<Mergers[i].buffer_size/2;o++){
exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
}
}
mergetime+=usecond();
}
@ -285,6 +308,8 @@ PARALLEL_FOR_LOOP
cobj* u_send_buf_p;
std::vector<scalar_object *> u_simd_send_buf;
std::vector<scalar_object *> u_simd_recv_buf;
std::vector<cobj *> new_simd_send_buf;
std::vector<cobj *> new_simd_recv_buf;
int u_comm_offset;
int _unified_buffer_size;
@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP
u_simd_send_buf.resize(Nsimd);
u_simd_recv_buf.resize(Nsimd);
new_simd_send_buf.resize(Nsimd);
new_simd_recv_buf.resize(Nsimd);
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
for(int l=0;l<Nsimd;l++){
u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
}
PrecomputeByteOffsets();
@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP
HaloGather(source,compress);
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
CommsMerge(); // spins
CommsMerge();
}
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
// GatherSimd(source,dimension,shift,0x3,compress,face_idx);
// std::cout << "GatherSimdNew"<<std::endl;
GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
@ -716,8 +746,9 @@ PARALLEL_FOR_LOOP
} else {
if(splice_dim){
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
// std::cout << "GatherSimdNew2calls"<<std::endl;
GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
@ -949,6 +980,120 @@ PARALLEL_FOR_LOOP
}
}
}
template<class compressor>
void GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
const int Nsimd = _grid->Nsimd();
const int maxl =2;// max layout in a direction
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
// This will not work with a rotate dim
assert(simd_layout==maxl);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(cobj)/sizeof(vector_type);
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
int bytes = (buffer_size*sizeof(cobj))/simd_layout;
assert(bytes*simd_layout == buffer_size*sizeof(cobj));
std::vector<cobj *> rpointers(maxl);
std::vector<cobj *> spointers(maxl);
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
int any_offnode = ( ((x+sshift)%fd) >= rd );
if ( any_offnode ) {
for(int i=0;i<maxl;i++){
spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
}
int sx = (x+sshift)%rd;
gathermtime-=usecond();
Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
gathermtime+=usecond();
//spointers[0] -- low
//spointers[1] -- high
for(int i=0;i<maxl;i++){
int my_coor = rd*i + x; // self explanatory
int nbr_coor = my_coor+sshift; // self explanatory
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_lcoor= (nbr_coor%ld); // local plane coor on neighbour node
int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer simd lane "i"
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer "x"
int nbr_plane = nbr_ic;
assert (sx == nbr_ox);
auto rp = &new_simd_recv_buf[i ][u_comm_offset];
auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
if(nbr_proc){
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
// shm == receive pointer if offnode
// shm == Translate[send pointer] if on node -- my view of his send pointer
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
if (shm==NULL) {
shm = rp;
}
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
rpointers[i] = shm;
} else {
rpointers[i] = sp;
}
}
AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
u_comm_offset +=buffer_size;
}
}
}
};
}

View File

@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
// MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
// assert (provided == MPI_THREAD_SERIALIZED);
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
assert (provided == MPI_THREAD_MULTIPLE);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();

View File

@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
@ -110,6 +111,7 @@ PARALLEL_NESTED_LOOP2
int o = n*n1;
int offset = b+n*e2;
cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset);
@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2
}
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> void
Gather_plane_exchange(const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
// Need to switch to a table loop
std::vector<std::pair<int,int> > table;
if ( cbmask ==0x3){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*n1;
int offset = b+n*e2;
table.push_back(std::pair<int,int> (offset,o+b));
}
}
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*e2;
if ( ocb & cbmask ) {
table.push_back(std::pair<int,int> (offset,o+b));
}
}
}
}
assert( (table.size()&0x1)==0);
PARALLEL_FOR_LOOP
for(int j=0;j<table.size()/2;j++){
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
}
}
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////

View File

@ -469,9 +469,47 @@ namespace Optimization {
static inline __m256d Permute3(__m256d in){
return in;
};
};
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
out1= _mm256_permute2f128_ps(in1,in2,0x20);
out2= _mm256_permute2f128_ps(in1,in2,0x31);
};
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
assert(0);
return;
};
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_permute2f128_pd(in1,in2,0x20);
out2= _mm256_permute2f128_pd(in1,in2,0x31);
return;
};
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_shuffle_pd(in1,in2,0x0);
out2= _mm256_shuffle_pd(in1,in2,0xF);
};
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
};
#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)

View File

@ -343,6 +343,46 @@ namespace Optimization {
};
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
// The operation is its own inverse
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1 = _mm512_shuffle_pd(in1,in2,0x00);
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
};
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
assert(0);
return;
};
};
struct Rotate{

View File

@ -326,7 +326,43 @@ namespace Optimization {
static inline __m128d Permute3(__m128d in){
return in;
};
};
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
out1= _mm_shuffle_pd(in1,in2,0x0);
out2= _mm_shuffle_pd(in1,in2,0x3);
};
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
};
struct Rotate{

View File

@ -350,6 +350,18 @@ class Grid_simd {
return ret;
}
///////////////////////
// Exchange
// Al Ah , Bl Bh -> Al Bl Ah,Bh
///////////////////////
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
{
if (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
@ -372,23 +384,11 @@ class Grid_simd {
int dist = perm & 0xF;
y = rotate(b, dist);
return;
}
switch (perm) {
case 3:
permute3(y, b);
break;
case 2:
permute2(y, b);
break;
case 1:
permute1(y, b);
break;
case 0:
permute0(y, b);
break;
default:
assert(0);
}
}
else if(perm==3) permute3(y, b);
else if(perm==2) permute2(y, b);
else if(perm==1) permute1(y, b);
else if(perm==0) permute0(y, b);
}
}; // end of Grid_simd class definition
@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
///////////////////////

View File

@ -105,6 +105,11 @@ class iScalar {
friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
rotate(out._internal,in._internal,rot);
}
friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
exchange(out1._internal,out2._internal,
in1._internal, in2._internal,type);
}
// Unary negation
friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
@ -248,6 +253,13 @@ class iVector {
rotate(out._internal[i],in._internal[i],rot);
}
}
friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
for(int i=0;i<N;i++){
exchange(out1._internal[i],out2._internal[i],
in1._internal[i], in2._internal[i],type);
}
}
// Unary negation
friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
@ -374,6 +386,14 @@ class iMatrix {
rotate(out._internal[i][j],in._internal[i][j],rot);
}}
}
friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
exchange(out1._internal[i][j],out2._internal[i][j],
in1._internal[i][j], in2._internal[i][j],type);
}}
}
// Unary negation
friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {

View File

@ -113,8 +113,6 @@ public:
// outerproduct,
// zeroit
// permute
class funcReduce {
public:
funcReduce() {};
@ -168,7 +166,7 @@ void Tester(const functor &func)
int ok=0;
for(int i=0;i<Nsimd;i++){
if ( abs(reference[i]-result[i])>1.0e-7){
if ( abs(reference[i]-result[i])>1.0e-6){
std::cout<<GridLogMessage<< "*****" << std::endl;
std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
ok++;
@ -245,6 +243,28 @@ public:
}
std::string name(void) const { return std::string("Permute"); }
};
class funcExchange {
public:
int n;
funcExchange(int _n) { n=_n;};
template<class vec> void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);}
template<class scal> void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2) const {
int sz=in1.size();
int msk = sz>>(n+1);
int j1=0;
int j2=0;
for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
for(int i=0;i<sz;i++) if ( (i&msk) ) r2[j2++] = in1[ i ];
for(int i=0;i<sz;i++) if ( (i&msk) ) r2[j2++] = in2[ i ];
}
std::string name(void) const { return std::string("Exchange"); }
};
class funcRotate {
public:
int n;
@ -325,6 +345,87 @@ void PermTester(const functor &func)
assert(ok==0);
}
template<class scal, class vec,class functor >
void ExchangeTester(const functor &func)
{
GridSerialRNG sRNG;
sRNG.SeedRandomDevice();
int Nsimd = vec::Nsimd();
std::vector<scal> input1(Nsimd);
std::vector<scal> input2(Nsimd);
std::vector<scal> result1(Nsimd);
std::vector<scal> result2(Nsimd);
std::vector<scal> reference1(Nsimd);
std::vector<scal> reference2(Nsimd);
std::vector<scal> test1(Nsimd);
std::vector<scal> test2(Nsimd);
std::vector<vec,alignedAllocator<vec> > buf(6);
vec & v_input1 = buf[0];
vec & v_input2 = buf[1];
vec & v_result1 = buf[2];
vec & v_result2 = buf[3];
vec & v_test1 = buf[4];
vec & v_test2 = buf[5];
for(int i=0;i<Nsimd;i++){
random(sRNG,input1[i]);
random(sRNG,input2[i]);
random(sRNG,result1[i]);
random(sRNG,result2[i]);
}
merge<vec,scal>(v_input1,input1);
merge<vec,scal>(v_input2,input2);
merge<vec,scal>(v_result1,result1);
merge<vec,scal>(v_result2,result1);
func(v_result1,v_result2,v_input1,v_input2);
func.apply(reference1,reference2,input1,input2);
func(v_test1,v_test2,v_result1,v_result2);
extract<vec,scal>(v_result1,result1);
extract<vec,scal>(v_result2,result2);
extract<vec,scal>(v_test1,test1);
extract<vec,scal>(v_test2,test2);
std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
// for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
// for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
for(int i=0;i<Nsimd;i++){
int found=0;
for(int j=0;j<Nsimd;j++){
if(reference1[j]==result1[i]) {
found=1;
// std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
}
}
assert(found==1);
}
for(int i=0;i<Nsimd;i++){
int found=0;
for(int j=0;j<Nsimd;j++){
if(reference2[j]==result2[i]) {
found=1;
// std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
}
}
assert(found==1);
}
// for(int i=0;i<Nsimd;i++){
// std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
// std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
// }
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@ -363,6 +464,15 @@ int main (int argc, char ** argv)
PermTester<RealF,vRealF>(funcPermute(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vRealF exchanges "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
// Log2 iteration
for(int i=0;(1<<i)< vRealF::Nsimd();i++){
ExchangeTester<RealF,vRealF>(funcExchange(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
@ -394,6 +504,14 @@ int main (int argc, char ** argv)
PermTester<RealD,vRealD>(funcPermute(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vRealD exchanges "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
// Log2 iteration
for(int i=0;(1<<i)< vRealD::Nsimd();i++){
ExchangeTester<RealD,vRealD>(funcExchange(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
@ -429,6 +547,16 @@ int main (int argc, char ** argv)
PermTester<ComplexF,vComplexF>(funcPermute(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vComplexF exchanges "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
// Log2 iteration
for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
ExchangeTester<ComplexF,vComplexF>(funcExchange(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
@ -466,6 +594,15 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vComplexD exchanges "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;
// Log2 iteration
for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
ExchangeTester<ComplexD,vComplexD>(funcExchange(i));
}
std::cout<<GridLogMessage << "==================================="<< std::endl;
std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
std::cout<<GridLogMessage << "==================================="<< std::endl;