mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm
This commit is contained in:
commit
cd0da81196
@ -240,6 +240,10 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int o=0;o<Mergers[i].buffer_size/2;o++){
|
for(int o=0;o<Mergers[i].buffer_size/2;o++){
|
||||||
exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
|
exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
|
||||||
Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
||||||
|
// cobj temp1,temp2;
|
||||||
|
// exchange(temp1,temp2,Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
||||||
|
// vstream(Mergers[i].mpointer[2*o],temp1);
|
||||||
|
// vstream(Mergers[i].mpointer[2*o+1],temp2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
@ -1037,9 +1041,7 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
gathermtime-=usecond();
|
gathermtime+=Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
|
||||||
Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
|
|
||||||
gathermtime+=usecond();
|
|
||||||
|
|
||||||
//spointers[0] -- low
|
//spointers[0] -- low
|
||||||
//spointers[1] -- high
|
//spointers[1] -- high
|
||||||
|
@ -100,7 +100,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
|
|
||||||
int flag;
|
int flag;
|
||||||
int provided;
|
int provided;
|
||||||
mtrace();
|
// mtrace();
|
||||||
|
|
||||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
if ( !flag ) {
|
if ( !flag ) {
|
||||||
@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
|||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
if ( (CommunicatorPolicy == CommunicatorPolicyIsend) ) {
|
if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
|
||||||
MPI_Request xrq;
|
MPI_Request xrq;
|
||||||
MPI_Request rrq;
|
MPI_Request rrq;
|
||||||
|
|
||||||
|
@ -142,12 +142,12 @@ PARALLEL_NESTED_LOOP2
|
|||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there *is* need to SIMD split with compression
|
// Gather for when there *is* need to SIMD split with compression
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class cobj,class vobj,class compressor> void
|
template<class cobj,class vobj,class compressor> double
|
||||||
Gather_plane_exchange(const Lattice<vobj> &rhs,
|
Gather_plane_exchange(const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
|
||||||
{
|
{
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
double t1,t2;
|
||||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||||
cbmask = 0x3;
|
cbmask = 0x3;
|
||||||
}
|
}
|
||||||
@ -186,13 +186,20 @@ Gather_plane_exchange(const Lattice<vobj> &rhs,
|
|||||||
}
|
}
|
||||||
|
|
||||||
assert( (table.size()&0x1)==0);
|
assert( (table.size()&0x1)==0);
|
||||||
|
t1=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int j=0;j<table.size()/2;j++){
|
for(int j=0;j<table.size()/2;j++){
|
||||||
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
||||||
cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
|
cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
|
||||||
cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
|
cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
|
||||||
exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
|
cobj temp3;
|
||||||
|
cobj temp4;
|
||||||
|
exchange(temp3,temp4,temp1,temp2,type);
|
||||||
|
vstream(pointers[0][j],temp3);
|
||||||
|
vstream(pointers[1][j],temp4);
|
||||||
}
|
}
|
||||||
|
t2=usecond();
|
||||||
|
return t2-t1;
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
@ -474,16 +474,31 @@ namespace Optimization {
|
|||||||
struct Exchange{
|
struct Exchange{
|
||||||
// 3210 ordering
|
// 3210 ordering
|
||||||
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||||
|
//Invertible
|
||||||
|
//AB CD -> AC BD
|
||||||
|
//AC BD -> AB CD
|
||||||
out1= _mm256_permute2f128_ps(in1,in2,0x20);
|
out1= _mm256_permute2f128_ps(in1,in2,0x20);
|
||||||
out2= _mm256_permute2f128_ps(in1,in2,0x31);
|
out2= _mm256_permute2f128_ps(in1,in2,0x31);
|
||||||
};
|
};
|
||||||
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||||
|
//Invertible
|
||||||
|
// ABCD EFGH ->ABEF CDGH
|
||||||
|
// ABEF CDGH ->ABCD EFGH
|
||||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||||
};
|
};
|
||||||
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
// Invertible ?
|
||||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
// ABCD EFGH -> ACEG BDFH
|
||||||
|
// ACEG BDFH -> AEBF CGDH
|
||||||
|
// out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||||
|
// out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||||
|
// Not self-inverse as written; need
|
||||||
|
// AECG BFDH -> ABCD EFGH
|
||||||
|
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
|
||||||
|
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
|
||||||
|
out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
|
||||||
|
out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
|
||||||
};
|
};
|
||||||
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||||
assert(0);
|
assert(0);
|
||||||
|
@ -419,8 +419,10 @@ void ExchangeTester(const functor &func)
|
|||||||
assert(found==1);
|
assert(found==1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
// std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
|
assert(test1[i]==input1[i]);
|
||||||
|
assert(test2[i]==input2[i]);
|
||||||
|
}// std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
|
||||||
// std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
|
// std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user