paboyle/Grid — https://github.com/paboyle/Grid.git
commit 3ae92fa2e6 (parent 3906cd2149)

Global changes to parallel_for structure.
Move the comms flags to more sensible names.
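At its core the change is mechanical: every occurrence of the two-line idiom — the PARALLEL_FOR_LOOP (or PARALLEL_NESTED_LOOP2) pragma macro followed by a bare for statement — collapses into a single parallel_for (or parallel_for_nest2) construct, defined in a later hunk of this diff. In sketch, each call site changes like this:

    // before: pragma macro and loop statement on separate lines
    PARALLEL_FOR_LOOP
    for(int ss=0;ss<N;ss++){ out[ss] = f(in[ss]); }

    // after: one construct, expanding back to the same pragma + loop
    parallel_for(int ss=0;ss<N;ss++){ out[ss] = f(in[ss]); }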
@@ -77,8 +77,7 @@ int main (int argc, char ** argv)
   }

   double start=usecond();
-PARALLEL_FOR_LOOP
-  for(int t=0;t<threads;t++){
+  parallel_for(int t=0;t<threads;t++){

     sum[t] = x[t]._odata[0];
     for(int i=0;i<Nloop;i++){
@@ -342,11 +342,11 @@ void Grid_init(int *argc,char ***argv)
   } else {
     QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sendrecv") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySendrecv);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
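The renamed flags feed the communicator policy through SetCommunicatorPolicy, whose declaration appears in a later hunk. For reference, a sketch of the equivalent programmatic selection (same names as in the hunks below):

    // Nonblocking Isend/Irecv with deferred completion (the new default):
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
    // Or blocking, one-exchange-at-a-time semantics:
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);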
@@ -81,77 +81,14 @@ template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
   int num=table.size();
-PARALLEL_FOR_LOOP
-  for(int i=0;i<num;i++){
+  parallel_for(int i=0;i<num;i++){
     vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
-    // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
   }
 }

 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
-/*
-template<class cobj,class vobj,class compressor> double
-Gather_plane_exchange(const Lattice<vobj> &rhs,
-                      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
-{
-  int rd = rhs._grid->_rdimensions[dimension];
-  double t1,t2;
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
-    cbmask = 0x3;
-  }
-
-  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-  int e1 =rhs._grid->_slice_nblock[dimension];
-  int e2 =rhs._grid->_slice_block [dimension];
-  int n1 =rhs._grid->_slice_stride[dimension];
-
-  // Need to switch to a table loop
-  std::vector<std::pair<int,int> > table;
-
-  if ( cbmask ==0x3){
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int o = n*n1;
-        int offset = b+n*e2;
-        table.push_back(std::pair<int,int> (offset,o+b));
-      }
-    }
-  } else {
-    // Case of SIMD split AND checker dim cannot currently be hit, except in
-    // Test_cshift_red_black code.
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int o=n*n1;
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-        int offset = b+n*e2;
-
-        if ( ocb & cbmask ) {
-          table.push_back(std::pair<int,int> (offset,o+b));
-        }
-      }
-    }
-  }
-
-  assert( (table.size()&0x1)==0);
-  t1=usecond();
-PARALLEL_FOR_LOOP
-  for(int j=0;j<table.size()/2;j++){
-    // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
-    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
-    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
-    cobj temp3;
-    cobj temp4;
-    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(pointers[0][j],temp3);
-    vstream(pointers[1][j],temp4);
-  }
-  t2=usecond();
-  return t2-t1;
-}
-*/

 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
                                  std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
@@ -164,8 +101,7 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
   assert( (table.size()&0x1)==0);
   int num=table.size()/2;
   int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-PARALLEL_FOR_LOOP
-  for(int j=0;j<num;j++){
+  parallel_for(int j=0;j<num;j++){
     // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
     cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
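Both gather routines share the same table-driven shape: the (destination, source) index pairs are computed once outside the loop, so the hot loop is a flat, dependence-free copy that parallel_for can split across threads with no further analysis. A self-contained sketch of the pattern (plain C++ stand-ins; Grid's versions additionally run each element through a compressor and vstream):

    #include <vector>
    #include <utility>

    // Table-driven gather: every (dst,src) pair is independent, so the loop
    // body can execute in any order or in parallel.
    template<class T>
    void gather_table(const std::vector<std::pair<int,int> > &table,
                      const T *in, T *out, int off, int so) {
      for (int i = 0; i < (int)table.size(); i++) {
        out[off + table[i].first] = in[so + table[i].second];
      }
    }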
@@ -235,19 +171,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,
                                           Packets[i].bytes);
-        if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) {
-          _grid->StencilSendToRecvFromComplete(reqs[i]);
-        }
       }
       commtime+=usecond();
     }
     void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
       commtime-=usecond();
-      if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) {
-        for(int i=0;i<Packets.size();i++){
-          _grid->StencilSendToRecvFromComplete(reqs[i]);
-        }
+      for(int i=0;i<Packets.size();i++){
+        _grid->StencilSendToRecvFromComplete(reqs[i]);
       }
       _grid->StencilBarrier();// Synch shared memory on a single nodes
       commtime+=usecond();
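Read together with the CartesianCommunicator hunks further down, the stencil no longer needs to branch on the policy here: under the Sequential policy each exchange now appears to be completed inside StencilSendToRecvFromBegin itself, so CommunicateComplete can simply wait on whatever requests remain in flight in either mode.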
@@ -327,14 +258,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
       // std::ofstream fout(fname);

       if ( Mergers[i].exchange == 0 ) {
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size;o++){
+        parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
          merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
          // fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
        }
       } else {
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size/2;o++){
+        parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
          exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
                   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
          // cobj temp1,temp2;
@@ -51,6 +51,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PARALLEL_CRITICAL
 #endif

+#define parallel_for       PARALLEL_FOR_LOOP for
+#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
+
 namespace Grid {

 // Introduce a class to gain deterministic bit reproducible reduction.
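The wrapper is pure preprocessor: parallel_for(...) expands to PARALLEL_FOR_LOOP for(...). This diff does not show how PARALLEL_FOR_LOOP itself is defined; the sketch below assumes the usual _Pragma-based OpenMP definition (empty in serial builds), which is what makes the single-token spelling possible:

    // Standalone illustration of the macro mechanics (assumed definitions).
    #include <cstdio>

    #ifdef _OPENMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif
    #define parallel_for PARALLEL_FOR_LOOP for

    int main(void) {
      static double sum[8];
      // expands to: _Pragma("omp parallel for") for(int t=0;t<8;t++){ ... }
      parallel_for(int t=0;t<8;t++) {
        sum[t] = 2.0*t;
      }
      printf("%g\n",sum[7]);
      return 0;
    }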
@@ -267,8 +267,7 @@ namespace Grid {
       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,compressor);

-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
@@ -380,8 +379,7 @@ PARALLEL_FOR_LOOP
       Subspace.ProjectToSubspace(oProj,oblock);
       // blockProject(iProj,iblock,Subspace.subspace);
       // blockProject(oProj,oblock,Subspace.subspace);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        for(int j=0;j<nbasis;j++){
          if( disp!= 0 ) {
            A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@@ -33,7 +33,7 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void * CartesianCommunicator::ShmCommBuf;
 uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
-CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicySendrecv;
+CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -118,7 +118,7 @@ class CartesianCommunicator {
   static void * ShmCommBuf;

   // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyIsend , CommunicatorPolicySendrecv };
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
   static CommunicatorPolicy_t CommunicatorPolicy;
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }

@@ -158,7 +158,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
   int myrank = _processor;
   int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
     MPI_Request xrq;
     MPI_Request rrq;

@@ -178,7 +178,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
@@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   int myrank = _processor;
   int ierr;

-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
     MPI_Request xrq;
     MPI_Request rrq;

@@ -567,6 +567,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     list.push_back(xrq);
     off_node_bytes+=bytes;
   }

+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+    this->StencilSendToRecvFromComplete(list);
+  }
+
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
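The added branch is what gives the Sequential policy its blocking character: every transfer is completed before Begin returns, while Concurrent leaves them all in flight until the matching Complete call. An illustration of the two disciplines using std::async as a stand-in for MPI_Isend/MPI_Irecv (not Grid code):

    #include <future>
    #include <iostream>
    #include <vector>

    enum Policy { Concurrent, Sequential };

    int transfer(int i) { return i*i; }  // stand-in for one halo exchange

    int main() {
      Policy policy = Concurrent;
      std::vector<std::future<int>> inflight;
      for (int i=0;i<4;i++) {
        inflight.push_back(std::async(std::launch::async, transfer, i));
        if (policy == Sequential) inflight.back().wait(); // complete immediately
      }
      for (auto &f : inflight) std::cout << f.get() << "\n"; // Concurrent: wait here
      return 0;
    }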
@@ -585,8 +590,8 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &

   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  list.resize(0);
   assert(ierr==0);
+  list.resize(0);
 }
 void CartesianCommunicator::Barrier(void)
 {
@@ -58,8 +58,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen

   int stride=rhs._grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o  = n*stride;
        int bo = n*e2;
@@ -78,8 +77,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   }
-PARALLEL_FOR_LOOP
-  for(int i=0;i<table.size();i++){
+  parallel_for(int i=0;i<table.size();i++){
     buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
   }
 }
@@ -105,8 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   int n1=rhs._grid->_slice_stride[dimension];

   if ( cbmask ==0x3){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o = n*n1;
@@ -122,8 +119,7 @@ PARALLEL_NESTED_LOOP2
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o=n*n1;
@@ -175,8 +171,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int stride=rhs._grid->_slice_stride[dimension];

   if ( cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o  =n*rhs._grid->_slice_stride[dimension];
        int bo =n*rhs._grid->_slice_block[dimension];
@@ -195,8 +190,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   }
-PARALLEL_FOR_LOOP
-  for(int i=0;i<table.size();i++){
+  parallel_for(int i=0;i<table.size();i++){
     // std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
     rhs._odata[table[i].first]=buffer[table[i].second];
   }
@@ -220,8 +214,7 @@ PARALLEL_FOR_LOOP
   int e2=rhs._grid->_slice_block[dimension];

   if(cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o      = n*rhs._grid->_slice_stride[dimension];
        int offset = b+n*rhs._grid->_slice_block[dimension];
@@ -265,8 +258,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e2=rhs._grid->_slice_block[dimension];
   int stride = rhs._grid->_slice_stride[dimension];
   if(cbmask == 0x3 ){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o =n*stride+b;
@@ -275,8 +267,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   } else {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o =n*stride+b;
@@ -306,8 +297,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block [dimension];
   int stride = rhs._grid->_slice_stride[dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
+  parallel_for_nest2(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){

      int o =n*stride;
@@ -39,8 +39,7 @@ namespace Grid {
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
   void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
|
|||||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mac(&tmp,&lhs._odata[ss],&rhs);
|
mac(&tmp,&lhs._odata[ss],&rhs);
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
sub(&tmp,&lhs._odata[ss],&rhs);
|
sub(&tmp,&lhs._odata[ss],&rhs);
|
||||||
@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
add(&tmp,&lhs._odata[ss],&rhs);
|
add(&tmp,&lhs._odata[ss],&rhs);
|
||||||
@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.checkerboard = rhs.checkerboard;
|
ret.checkerboard = rhs.checkerboard;
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mult(&tmp,&lhs,&rhs._odata[ss]);
|
mult(&tmp,&lhs,&rhs._odata[ss]);
|
||||||
@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.checkerboard = rhs.checkerboard;
|
ret.checkerboard = rhs.checkerboard;
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mac(&tmp,&lhs,&rhs._odata[ss]);
|
mac(&tmp,&lhs,&rhs._odata[ss]);
|
||||||
@@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
   void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs,&rhs._odata[ss]);
@@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
   void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs,&rhs._odata[ss]);
@@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
       vstream(ret._odata[ss],tmp);
@@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];
       vstream(ret._odata[ss],tmp);
@@ -121,8 +121,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       //vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,eval(ss,expr));
@@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -235,8 +230,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       vstream(_odata[ss] ,eval(ss,expr));
     }
   };
@@ -258,8 +252,7 @@ PARALLEL_FOR_LOOP
     _grid = r._grid;
     checkerboard = r.checkerboard;
     _odata.resize(_grid->oSites());// essential
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       _odata[ss]=r._odata[ss];
     }
   }
@@ -269,8 +262,7 @@ PARALLEL_FOR_LOOP
     virtual ~Lattice(void) = default;

     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
        this->_odata[ss]=r;
      }
      return *this;
@@ -279,8 +271,7 @@ PARALLEL_FOR_LOOP
     this->checkerboard = r.checkerboard;
     conformable(*this,r);

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       this->_odata[ss]=r._odata[ss];
     }
     return *this;
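None of the Lattice assignment or expression-template hunks above change user-visible behaviour; a site-wise expression is still evaluated in one parallel sweep over oSites(). A hedged usage sketch (field and grid construction elided):

    // LatticeFermion r(grid), x(grid), y(grid);  RealD a;
    // r = a*x + y;   // evaluated by one parallel_for over _grid->oSites(),
    //                // with streaming stores when STREAMING_STORES is defined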
@@ -45,90 +45,87 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
 {
   Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<rhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
+  }
+  return ret;
+}
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to scalar
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
 {
   Lattice<vInteger> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<lhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs._odata[ss],rhs);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs._odata[ss],rhs);
+  }
+  return ret;
+}
 //////////////////////////////////////////////////////////////////////////
 // compare scalar to lattice
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
 {
   Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<rhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs,rhs._odata[ss]);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs,rhs._odata[ss]);
+  }
+  return ret;
+}

 //////////////////////////////////////////////////////////////////////////
 // Map to functors
 //////////////////////////////////////////////////////////////////////////
 // Less than
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vlt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vlt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vlt<lobj,robj>(),lhs,rhs);
 }

 // Less than equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vle<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vle<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vle<lobj,robj>(),lhs,rhs);
 }

 // Greater than
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vgt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vgt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vgt<lobj,robj>(),lhs,rhs);
 }


 // Greater than equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vge<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
@@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
   return LSComparison(vge<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vge<lobj,robj>(),lhs,rhs);
 }

 // equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(veq<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(veq<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(veq<lobj,robj>(),lhs,rhs);
 }


 // not equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vne<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vne<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vne<lobj,robj>(),lhs,rhs);
 }

 }
 #endif
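The comparison overloads produce a Lattice<vInteger> mask by applying a functor site by site. A self-contained analogue of the LLComparison pattern (std::vector in place of Lattice; illustrative only):

    #include <cstdio>
    #include <vector>

    // Functor applied element-wise to build an integer mask, as LLComparison
    // does per lattice site.
    template<class F,class T>
    std::vector<int> compare(F op,const std::vector<T> &l,const std::vector<T> &r){
      std::vector<int> ret(l.size());
      for (int ss=0; ss<(int)l.size(); ss++) ret[ss]=op(l[ss],r[ss]);
      return ret;
    }

    struct vlt { template<class T> int operator()(const T &a,const T &b) const { return a<b; } };

    int main() {
      std::vector<double> a = {1,3,2}, b = {2,2,2};
      for (int m : compare(vlt(),a,b)) printf("%d ",m);  // prints: 1 0 0
      printf("\n");
      return 0;
    }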
@@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

   /////////////////////////////////////////////////////
   // Non site, reduced locally reduced routines
   /////////////////////////////////////////////////////

   // localNorm2,
   template<class vobj>
   inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
   {
     Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
   }

   // localInnerProduct
   template<class vobj>
   inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
   {
     Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
     }
     return ret;
   }

   // outerProduct Scalar x Scalar -> Scalar
   //              Vector x Vector -> Matrix
   template<class ll,class rr>
   inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
   {
     Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
   }

 }

 #endif
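Unlike the global reductions, localNorm2 and localInnerProduct reduce only over internal (spin/colour) indices, returning a lattice with one tensor_reduced value per site — which is why the loop is a plain parallel_for with no cross-site accumulation. Roughly, in usage (sketch, setup elided):

    // Lattice<typename vobj::tensor_reduced> d = localNorm2(x);          // |x(site)|^2 per site
    // Lattice<typename vobj::tensor_reduced> p = localInnerProduct(x,y); // <x(site),y(site)> per site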
@@ -37,8 +37,7 @@ namespace Grid {
   inline Lattice<vobj> operator -(const Lattice<vobj> &r)
   {
     Lattice<vobj> ret(r._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<r._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
      vstream(ret._odata[ss], -r._odata[ss]);
    }
    return ret;
@@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
   {
     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs*rhs._odata[ss];
@@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
   inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
   {
     Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs+rhs._odata[ss];
|
|||||||
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
|
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
|
||||||
{
|
{
|
||||||
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
|
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
|
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
// ret._odata[ss]=lhs-rhs._odata[ss];
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
   inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
   {
     Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
       decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs._odata[ss]*rhs;
|
|||||||
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
|
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
|
||||||
{
|
{
|
||||||
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
|
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
|
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
// ret._odata[ss]=lhs._odata[ss]+rhs;
|
// ret._odata[ss]=lhs._odata[ss]+rhs;
|
||||||
@@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
   inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
   {
     Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs._odata[ss]-rhs;
     }
     return ret;
   }


 }
 #endif
@@ -44,22 +44,20 @@ namespace Grid {
   {
     Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
     ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+    }
+    return ret;
   };
   template<int Index,class vobj>
   auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
   {
     Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
     ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+    }
+    return ret;
   };

   ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
   template<int Index,class vobj>
   void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
   {
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-    }
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+    }
   }
   template<int Index,class vobj>
   void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
   {
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-    }
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+    }
   }

   //////////////////////////////////////////////////////
|
|||||||
|
|
||||||
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
||||||
|
|
||||||
// FIXME
|
|
||||||
// assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
|
||||||
|
|
||||||
int rank,odx,idx;
|
int rank,odx,idx;
|
||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
|
@@ -40,8 +40,7 @@ namespace Grid {

   template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
     Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
       ret._odata[ss] = adj(lhs._odata[ss]);
     }
     return ret;
@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs._grid);
|
Lattice<vobj> ret(lhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
ret._odata[ss] = conjugate(lhs._odata[ss]);
|
||||||
ret._odata[ss] = conjugate(lhs._odata[ss]);
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
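
Every hunk above follows the same mechanical rewrite: the two-line PARALLEL_FOR_LOOP / for(...) idiom collapses into a single parallel_for(...) statement. The test files touched later in this commit show the one-line definition being removed (#define parallel_for PARALLEL_FOR_LOOP for), so a minimal sketch of the whole mechanism — assuming PARALLEL_FOR_LOOP is the usual OpenMP guard, which this diff does not itself show — looks like:

    // Sketch only: PARALLEL_FOR_LOOP expanding to an OpenMP pragma is an
    // assumption; the real definition lives in Grid's threading header.
    #ifdef GRID_OMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif
    // Appending the trailing "for" keyword lets a call site read as one
    // statement:
    #define parallel_for PARALLEL_FOR_LOOP for
    //
    // so that  parallel_for(int ss=0;ss<N;ss++){ ... }
    // expands to
    //   #pragma omp parallel for
    //   for(int ss=0;ss<N;ss++){ ... }
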
@@ -57,8 +57,7 @@ namespace Grid {
 sumarray[i]=zero;
 }
 
-PARALLEL_FOR_LOOP
-for(int thr=0;thr<grid->SumArraySize();thr++){
+parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 int nwork, mywork, myoff;
 GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 
@@ -68,7 +67,7 @@ PARALLEL_FOR_LOOP
 }
 sumarray[thr]=TensorRemove(vnrm) ;
 }
 
 vector_type vvnrm; vvnrm=zero; // sum across threads
 for(int i=0;i<grid->SumArraySize();i++){
 vvnrm = vvnrm+sumarray[i];
@@ -114,18 +113,17 @@ PARALLEL_FOR_LOOP
 sumarray[i]=zero;
 }
 
-PARALLEL_FOR_LOOP
-for(int thr=0;thr<grid->SumArraySize();thr++){
+parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 int nwork, mywork, myoff;
 GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 
 vobj vvsum=zero;
 for(int ss=myoff;ss<mywork+myoff; ss++){
 vvsum = vvsum + arg._odata[ss];
 }
 sumarray[thr]=vvsum;
 }
 
 vobj vsum=zero; // sum across threads
 for(int i=0;i<grid->SumArraySize();i++){
 vsum = vsum+sumarray[i];
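
The two reductions above (the norm and the site sum) share one structure: a parallel pass writes one partial result per thread slot, then a short serial pass combines the slots, which keeps the vectorised accumulation deterministic. A condensed sketch of the site-sum case, using only names visible in the hunks (the contiguous-chunk contract of GridThread::GetWork is an assumption about its interface):

    // Two-stage reduction: parallel partial sums, one slot per thread,
    // then a serial combine over the slots.
    std::vector<vobj> sumarray(grid->SumArraySize());
    for(int i=0;i<grid->SumArraySize();i++) sumarray[i]=zero;

    parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
      int mywork, myoff;
      GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // contiguous chunk per thread
      vobj vvsum=zero;
      for(int ss=myoff;ss<mywork+myoff;ss++) vvsum = vvsum + arg._odata[ss];
      sumarray[thr]=vvsum;  // no races: each thread owns its own slot
    }
    vobj vsum=zero;
    for(int i=0;i<grid->SumArraySize();i++) vsum = vsum+sumarray[i]; // serial combine
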
@@ -302,8 +302,7 @@ namespace Grid {
 int words=sizeof(scalar_object)/sizeof(scalar_type);
 
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<osites;ss++){
+parallel_for(int ss=0;ss<osites;ss++){
 
 std::vector<scalar_object> buf(Nsimd);
 for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
 
@@ -42,8 +42,7 @@ namespace Grid {
 -> Lattice<decltype(trace(lhs._odata[0]))>
 {
 Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = trace(lhs._odata[ss]);
 }
 return ret;
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
 inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
 {
 Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
 }
 return ret;
 
@@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
 half.checkerboard = cb;
 int ssh=0;
-//PARALLEL_FOR_LOOP
+//parallel_for
 for(int ss=0;ss<full._grid->oSites();ss++){
 std::vector<int> coor;
 int cbos;
@@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
 int cb = half.checkerboard;
 int ssh=0;
-//PARALLEL_FOR_LOOP
+//parallel_for
 for(int ss=0;ss<full._grid->oSites();ss++){
 std::vector<int> coor;
 int cbos;
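
Note that pickCheckerboard and setCheckerboard keep their site loops serial: only the comment is renamed from //PARALLEL_FOR_LOOP to //parallel_for. Presumably the running ssh index into the half-checkerboard field makes the iterations order-dependent, so the loop cannot safely be parallelised as written.
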
@@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
 assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
 }
 
-PARALLEL_FOR_LOOP
-for(int sf=0;sf<fine->oSites();sf++){
+parallel_for(int sf=0;sf<fine->oSites();sf++){
 
 int sc;
 std::vector<int> coor_c(_ndimension);
@@ -186,8 +185,7 @@ template<class vobj,class CComplex>
 
 fine_inner = localInnerProduct(fineX,fineY);
 blockSum(coarse_inner,fine_inner);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<coarse->oSites();ss++){
+parallel_for(int ss=0;ss<coarse->oSites();ss++){
 CoarseInner._odata[ss] = coarse_inner._odata[ss];
 }
 }
@@ -347,8 +345,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 assert(ig->lSites() == og->lSites());
 }
 
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<ig->lSites();idx++){
+parallel_for(int idx=0;idx<ig->lSites();idx++){
 sobj s;
 ssobj ss;
 
@@ -386,8 +383,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -428,8 +424,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 }
 }
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -468,8 +463,7 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -504,8 +498,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
 in_grid->iCoorFromIindex(in_icoor[lane], lane);
 }
 
-PARALLEL_FOR_LOOP
-for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
 //Assemble vector of pointers to output elements
 std::vector<sobj*> out_ptrs(in_nsimd);
 
@@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 std::vector<SobjOut> in_slex_conv(in_grid->lSites());
 unvectorizeToLexOrdArray(in_slex_conv, in);
 
-PARALLEL_FOR_LOOP
-for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
 std::vector<int> out_ocoor(ndim);
 out_grid->oCoorFromOindex(out_ocoor, out_oidx);
 
@@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 merge(out._odata[out_oidx], ptrs, 0);
 }
 }
-
-
-
-
 }
 #endif
@@ -40,27 +40,24 @@ namespace Grid {
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
 Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = transpose(lhs._odata[ss]);
 }
 return ret;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Index level dependent transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<int Index,class vobj>
 inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
 {
 Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
 }
 return ret;
 };
 
 }
 #endif
@@ -37,8 +37,7 @@ namespace Grid {
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=pow(rhs._odata[ss],y);
 }
 return ret;
@@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=mod(rhs._odata[ss],y);
 }
 return ret;
@@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=div(rhs._odata[ss],y);
 }
 return ret;
@@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
 }
 return ret;
@@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
 std::vector<scalar_object> truevals (Nsimd);
 std::vector<scalar_object> falsevals(Nsimd);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
 
 extract(iftrue._odata[ss] ,truevals);
 extract(iffalse._odata[ss] ,falsevals);
@@ -54,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 // Flops = 6.0*(Nc*Ns) *Ls*vol
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 for(int s=0;s<Ls;s++){
 auto tmp = psi._odata[0];
 if ( s==0 ) {
@@ -98,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 // Flops = 6.0*(Nc*Ns) *Ls*vol
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 auto tmp = psi._odata[0];
 for(int s=0;s<Ls;s++){
 if ( s==0 ) {
@@ -137,8 +137,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
 MooeeInvCalls++;
 MooeeInvTime-=usecond();
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 auto tmp = psi._odata[0];
 
 // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
@@ -184,8 +183,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
 MooeeInvCalls++;
 MooeeInvTime-=usecond();
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 
 auto tmp = psi._odata[0];
 
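
All four five-dimensional kernels above parallelise over four-dimensional sites in strides of Ls and walk the fifth dimension serially inside each stride. A stripped-down sketch of that indexing convention — the s-fastest layout, i.e. _odata[ss+s], is read off the loops above; the trivial body is a placeholder for the real s-coupling:

    parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // one block of Ls 5d sites
      for(int s=0;s<Ls;s++){                         // fifth dimension, serial
        // placeholder: the real kernels couple slice s to s-1 and s+1 here
        chi._odata[ss+s] = psi._odata[ss+s];
      }
    }
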
@@ -91,8 +91,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 
 assert(Nc==3);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 #if 0
 alignas(64) SiteHalfSpinor hp;
 alignas(64) SiteHalfSpinor hm;
@@ -232,8 +231,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 #if 0
 alignas(64) SiteHalfSpinor hp;
 alignas(64) SiteHalfSpinor hm;
@@ -792,13 +790,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 MooeeInvTime-=usecond();
 
 if ( switcheroo<Coeff_t>::iscomplex() ) {
-PARALLEL_FOR_LOOP
-for(auto site=0;site<vol;site++){
+parallel_for(auto site=0;site<vol;site++){
 MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
 }
 } else {
-PARALLEL_FOR_LOOP
-for(auto site=0;site<vol;site++){
+parallel_for(auto site=0;site<vol;site++){
 MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
 }
 }
 
@@ -194,8 +194,7 @@ namespace QCD {
 GaugeLinkField tmp(mat._grid);
 tmp = zero;
 
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<tmp._grid->oSites();sss++){
+parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){
 int sU=sss;
 for(int s=0;s<Ls;s++){
 int sF = s+Ls*sU;
@@ -445,8 +444,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 Uconj = where(coor==neglink,-Uconj,Uconj);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](0)(mu) = U[ss]();
 Uds[ss](1)(mu) = Uconj[ss]();
 }
@@ -459,8 +457,7 @@ PARALLEL_FOR_LOOP
 Utmp = where(coor==0,Uconj,Utmp);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](0)(mu+4) = Utmp[ss]();
 }
 
@@ -469,8 +466,7 @@ PARALLEL_FOR_LOOP
 Utmp = where(coor==0,U,Utmp);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](1)(mu+4) = Utmp[ss]();
 }
 
@@ -484,8 +480,7 @@ PARALLEL_FOR_LOOP
 GaugeLinkField link(mat._grid);
 // use lorentz for flavour as hack.
 auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
-PARALLEL_FOR_LOOP
-for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
 link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
 }
 PokeIndex<LorentzIndex>(mat, link, mu);
@@ -498,8 +493,7 @@ PARALLEL_FOR_LOOP
 
 GaugeLinkField tmp(mat._grid);
 tmp = zero;
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
+parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) {
 for (int s = 0; s < Ls; s++) {
 int sF = s + Ls * ss;
 auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
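
The G-parity hunks above also use an iterator-style spelling, parallel_for(auto ss=U.begin(); ss<U.end(); ss++). Because parallel_for is plain textual substitution ending in the for keyword (see the define removed from the test programs below), any loop header is accepted. The caveat — an assumption about the build, not shown in this diff — is that when PARALLEL_FOR_LOOP expands to an OpenMP pragma, the loop variable must still satisfy OpenMP's canonical loop form, which these begin()/end() site ranges evidently do.
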
@@ -222,8 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 ////////////////////////
 // Call the single hop
 ////////////////////////
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < B._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
 Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
 gamma);
 }
@@ -333,8 +332,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
 
 Stencil.HaloExchange(in, compressor);
 
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
 }
 };
@@ -350,13 +348,11 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 st.HaloExchange(in, compressor);
 
 if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
 }
 } else {
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
 }
 }
@@ -275,8 +275,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
 assert(dirdisp<=7);
 assert(dirdisp>=0);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<Umu._grid->oSites();ss++){
+parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
 for(int s=0;s<Ls;s++){
 int sU=ss;
 int sF = s+Ls*sU;
@@ -323,8 +322,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 ////////////////////////
 
 DerivDhopComputeTime -= usecond();
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < U._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
 for (int s = 0; s < Ls; s++) {
 int sU = sss;
 int sF = s + Ls * sU;
@@ -493,73 +491,18 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 
 if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 int sU = ss;
 int sF = LLs * sU;
 Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
 }
 } else {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 int sU = ss;
 int sF = LLs * sU;
 Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
 }
 }
-/*
-
-if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
-int sU = ss;
-int sF = LLs * sU;
-Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-#ifdef AVX512_SWITCHOFF
-} else if (stat.is_init() ) {
-
-int nthreads;
-stat.start();
-#pragma omp parallel
-{
-#pragma omp master
-nthreads = omp_get_num_threads();
-int mythread = omp_get_thread_num();
-stat.enter(mythread);
-#pragma omp for nowait
-for(int ss=0;ss<U._grid->oSites();ss++) {
-int sU=ss;
-int sF=LLs*sU;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-stat.exit(mythread);
-}
-stat.accum(nthreads);
-#endif
-} else {
-#if 1
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
-int sU = ss;
-int sF = LLs * sU;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-#else
-#ifdef GRID_OMP
-#pragma omp parallel
-#endif
-{
-int len = U._grid->oSites();
-int me, myoff,mywork;
-GridThread::GetWorkBarrier(len,me, mywork,myoff);
-int sF = LLs * myoff;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
-}
-#endif
-}
-*/
-
 DhopComputeTime+=usecond();
 }
 
@@ -66,8 +66,7 @@ public:
 // Move this elsewhere? FIXME
 static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
 int mu) { // U[mu] += W
-PARALLEL_FOR_LOOP
-for (auto ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (auto ss = 0; ss < U._grid->oSites(); ss++) {
 U._odata[ss]._internal[mu] =
 U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 }
@@ -48,8 +48,7 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
 GridBase *grid=x._grid;
 
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss++){
+parallel_for(int ss=0;ss<grid->oSites();ss++){
 vobj tmp;
 tmp = a*x._odata[ss];
 tmp = tmp + G5*(b*timesI(x._odata[ss]));
@@ -65,8 +64,7 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
 vstream(z._odata[ss+s],tmp);
 }
@@ -81,8 +79,7 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 tmp = G5*x._odata[ss+s]*a;
 tmp = tmp + b*y._odata[ss+sp];
@@ -99,8 +96,7 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 tmp = G5*y._odata[ss+sp]*b;
 tmp = tmp + a*x._odata[ss+s];
@@ -117,8 +113,7 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp1;
 vobj tmp2;
 tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
@@ -135,8 +130,7 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 spProj5m(tmp,y._odata[ss+sp]);
 tmp = a*x._odata[ss+s]+b*tmp;
@@ -152,8 +146,7 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 spProj5p(tmp,y._odata[ss+sp]);
 tmp = a*x._odata[ss+s]+b*tmp;
@@ -169,8 +162,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
 conformable(x,z);
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 for(int s=0;s<Ls;s++){
 int sp = Ls-1-s;
@@ -221,8 +221,7 @@ class SU {
 int i0, i1;
 su2SubGroupIndex(i0, i1, su2_index);
 
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
 subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
 subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
 subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
@@ -252,8 +251,7 @@ class SU {
 su2SubGroupIndex(i0, i1, su2_index);
 
 dest = 1.0; // start out with identity
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
 dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
 dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
 dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);
@@ -31,8 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);
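
The same two-line deletion repeats across the remaining benchmarks: each test program had carried its own local copy of

    #define parallel_for PARALLEL_FOR_LOOP for

and with this commit the macro is evidently provided once by the library headers instead, so the per-file copies go.
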
@@ -31,8 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {