Mirror of https://github.com/paboyle/Grid.git

Commit: Global changes to parallel_for structure.
        Move the comms flags to more sensible names.
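Taken together, the hunks below do four things: they introduce parallel_for and
parallel_for_nest2 macros in Threads.h and collapse every two-line
PARALLEL_FOR_LOOP / PARALLEL_NESTED_LOOP2 + for(...) pair across the lattice,
cshift, stencil and communicator code into a single parallel_for(...) statement;
they rename the communicator policies CommunicatorPolicyIsend and
CommunicatorPolicySendrecv to CommunicatorPolicyConcurrent and
CommunicatorPolicySequential, with matching --comms-concurrent and
--comms-sequential command-line flags, and make Concurrent the default; they move
the Sequential-policy completion into StencilSendToRecvFromBegin; and they delete
the dead, commented-out Gather_plane_exchange.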
@@ -77,8 +77,7 @@ int main (int argc, char ** argv)
     }
 
     double start=usecond();
-PARALLEL_FOR_LOOP
-    for(int t=0;t<threads;t++){
+    parallel_for(int t=0;t<threads;t++){
 
       sum[t] = x[t]._odata[0];
       for(int i=0;i<Nloop;i++){
@@ -342,11 +342,11 @@ void Grid_init(int *argc,char ***argv)
   } else {
     QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sendrecv") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySendrecv);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
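The user-visible flags track the policy rename one-for-one: --comms-isend
becomes --comms-concurrent and --comms-sendrecv becomes --comms-sequential; the
surrounding WilsonKernelsStatic::Comms and --lebesgue handling is unchanged
context.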
@@ -81,77 +81,14 @@ template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
   int num=table.size();
-PARALLEL_FOR_LOOP
-  for(int i=0;i<num;i++){
+  parallel_for(int i=0;i<num;i++){
     vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
     //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
   }
 }
 
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-/*
-template<class cobj,class vobj,class compressor> double
-Gather_plane_exchange(const Lattice<vobj> &rhs,
-		      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
-{
-  int rd = rhs._grid->_rdimensions[dimension];
-  double t1,t2;
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
-    cbmask = 0x3;
-  }
-
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int e1  =rhs._grid->_slice_nblock[dimension];
-  int e2  =rhs._grid->_slice_block [dimension];
-  int n1  =rhs._grid->_slice_stride[dimension];
-
-  // Need to switch to a table loop
-  std::vector<std::pair<int,int> > table;
-
-  if ( cbmask ==0x3){
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-	int o      =   n*n1;
-	int offset = b+n*e2;
-	table.push_back(std::pair<int,int> (offset,o+b));
-      }
-    }
-  } else { 
-    // Case of SIMD split AND checker dim cannot currently be hit, except in 
-    // Test_cshift_red_black code.
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-	int o=n*n1;
-	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-	int offset = b+n*e2;
-
-	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (offset,o+b));
-	}
-      }
-    }
-  }
-
-  assert( (table.size()&0x1)==0);
-  t1=usecond();
-PARALLEL_FOR_LOOP     
-  for(int j=0;j<table.size()/2;j++){
-    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
-    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
-    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
-    cobj temp3;
-    cobj temp4;
-    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(pointers[0][j],temp3);
-    vstream(pointers[1][j],temp4);
-  }
-  t2=usecond();
-  return t2-t1;
-}
-*/
-
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
 				 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
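The bulk of this hunk deletes the long-dead, commented-out
Gather_plane_exchange (everything between the /* and */ markers); the
table-driven Gather_plane_exchange_table declared at the end of the hunk is its
live replacement.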
@@ -164,8 +101,7 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
   assert( (table.size()&0x1)==0);
   int num=table.size()/2;
   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-PARALLEL_FOR_LOOP     
-  for(int j=0;j<num;j++){
+  parallel_for(int j=0;j<num;j++){
     //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
     cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
     cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
@@ -235,19 +171,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 					  Packets[i].recv_buf,
 					  Packets[i].from_rank,
 					  Packets[i].bytes);
-      if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) {
-	_grid->StencilSendToRecvFromComplete(reqs[i]);
-      }
     }
     commtime+=usecond();
   }
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     commtime-=usecond();
-    if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) {
-      for(int i=0;i<Packets.size();i++){
-	_grid->StencilSendToRecvFromComplete(reqs[i]);
-      }
-    }
+    for(int i=0;i<Packets.size();i++){
+      _grid->StencilSendToRecvFromComplete(reqs[i]);
+    }
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     commtime+=usecond();
@@ -327,14 +258,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
       //      std::ofstream fout(fname);
 
       if ( Mergers[i].exchange == 0 ) { 
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size;o++){
+	parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
 	  merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
 	}
       } else { 
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size/2;o++){
+	parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
 	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
 		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 	  //	  cobj temp1,temp2;

@@ -51,6 +51,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PARALLEL_CRITICAL
 #endif
 
+#define parallel_for       PARALLEL_FOR_LOOP for
+#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
+
 namespace Grid {
 
   // Introduce a class to gain deterministic bit reproducible reduction.

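This Threads.h hunk is the heart of the commit. A minimal sketch of what the
macro buys, under the assumption that PARALLEL_FOR_LOOP wraps the usual OpenMP
pragma (the exact pragma lives elsewhere in Threads.h and is an assumption
here; axpy_sites is an illustrative function, not Grid code):

    // Assumption: Grid's Threads.h defines something equivalent to
    //   #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    // Then
    //   parallel_for(int ss=0;ss<N;ss++){ ... }
    // expands to
    //   PARALLEL_FOR_LOOP for(int ss=0;ss<N;ss++){ ... }
    // i.e. the pragma and the for-statement fuse, which is why every
    // two-line "PARALLEL_FOR_LOOP + for(...)" pair in this diff collapses
    // into a single parallel_for(...) line.
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for") // assumption
    #define parallel_for      PARALLEL_FOR_LOOP for

    // Illustrative site loop in the style of the hunks below; each index
    // is independent, so the loop parallelises safely.
    void axpy_sites(double *y, const double *x, double a, int N) {
      parallel_for(int ss = 0; ss < N; ss++) {
        y[ss] = a * x[ss] + y[ss];
      }
    }

Without OpenMP the pragma is ignored and parallel_for degrades to a plain
serial for, so call sites stay portable.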
@@ -267,8 +267,7 @@ namespace Grid {
       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,compressor);
 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
         siteVector res = zero;
 	siteVector nbr;
 	int ptype;
@@ -380,8 +379,7 @@ PARALLEL_FOR_LOOP
 	  Subspace.ProjectToSubspace(oProj,oblock);
 	  //	  blockProject(iProj,iblock,Subspace.subspace);
 	  //	  blockProject(oProj,oblock,Subspace.subspace);
-PARALLEL_FOR_LOOP
-	  for(int ss=0;ss<Grid()->oSites();ss++){
+	  parallel_for(int ss=0;ss<Grid()->oSites();ss++){
 	    for(int j=0;j<nbasis;j++){
 	      if( disp!= 0 ) {
 		A[p]._odata[ss](j,i) = oProj._odata[ss](j);

@@ -33,7 +33,7 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
 uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicySendrecv;
+CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 
 /////////////////////////////////
 // Alloc, free shmem region

@@ -118,7 +118,7 @@ class CartesianCommunicator {
   static void * ShmCommBuf;
 
   // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyIsend , CommunicatorPolicySendrecv };
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
   static CommunicatorPolicy_t CommunicatorPolicy;
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
 

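Read together with the SendToRecvFrom hunks below, the rename is semantic
rather than mechanical: Concurrent keeps the old Isend behaviour (post all
MPI_Isend/MPI_Irecv, MPI_Waitall later), while Sequential, the old Sendrecv,
completes each transfer as soon as it is begun. A sketch of selecting the
policy; the caller and the umbrella include are hypothetical, only the
SetCommunicatorPolicy call and enum names come verbatim from the hunk above:

    #include <Grid/Grid.h>   // assumed umbrella header for the library

    void choose_comms(bool overlap) {
      using CC = Grid::CartesianCommunicator;
      // Concurrent: requests are queued and completed later via
      // MPI_Waitall in SendToRecvFromComplete (see hunks below).
      // Sequential: StencilSendToRecvFromBegin now completes each
      // transfer itself (see the @@ -567 hunk below).
      CC::SetCommunicatorPolicy(overlap ? CC::CommunicatorPolicyConcurrent
                                        : CC::CommunicatorPolicySequential);
    }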
@@ -158,7 +158,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
   int myrank = _processor;
   int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { 
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
     MPI_Request xrq;
     MPI_Request rrq;
 
@@ -178,7 +178,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { 
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
     int nreq=list.size();
     std::vector<MPI_Status> status(nreq);
     int ierr = MPI_Waitall(nreq,&list[0],&status[0]);

@@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   int myrank = _processor;
   int ierr;
 
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { 
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
     MPI_Request xrq;
     MPI_Request rrq;
 
@@ -567,6 +567,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     list.push_back(xrq);
     off_node_bytes+=bytes;
   }
 
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
+    this->StencilSendToRecvFromComplete(list);
+  }
+
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
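With this addition, a Sequential policy finishes each stencil transfer inside
StencilSendToRecvFromBegin itself, which is what allowed the per-packet
CommunicatorPolicySendrecv check to be dropped from CartesianStencil::Communicate
in the hunk further up.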
@@ -585,8 +590,8 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
 
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  list.resize(0);
   assert(ierr==0);
+  list.resize(0);
 }
 void CartesianCommunicator::Barrier(void)
 {

@@ -58,8 +58,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen
 
   int stride=rhs._grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) { 
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
@@ -78,8 +77,7 @@ PARALLEL_NESTED_LOOP2
 	 }
        }
      }
-PARALLEL_FOR_LOOP     
-     for(int i=0;i<table.size();i++){
+     parallel_for(int i=0;i<table.size();i++){
       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
      }
   }
@@ -105,8 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   int n1=rhs._grid->_slice_stride[dimension];
 
   if ( cbmask ==0x3){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 
 	int o      =   n*n1;
@@ -122,8 +119,7 @@ PARALLEL_NESTED_LOOP2
     // Case of SIMD split AND checker dim cannot currently be hit, except in 
     // Test_cshift_red_black code.
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 
 	int o=n*n1;
@@ -175,8 +171,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int stride=rhs._grid->_slice_stride[dimension];
   
   if ( cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
@@ -195,8 +190,7 @@ PARALLEL_NESTED_LOOP2
 	}
       }
     }
-PARALLEL_FOR_LOOP     
-     for(int i=0;i<table.size();i++){
+    parallel_for(int i=0;i<table.size();i++){
       //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
       rhs._odata[table[i].first]=buffer[table[i].second];
      }
@@ -220,8 +214,7 @@ PARALLEL_FOR_LOOP
   int e2=rhs._grid->_slice_block[dimension];
 
   if(cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];
@@ -265,8 +258,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e2=rhs._grid->_slice_block[dimension];
   int stride = rhs._grid->_slice_stride[dimension];
   if(cbmask == 0x3 ){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 
         int o =n*stride+b;
@@ -275,8 +267,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   } else { 
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 
         int o =n*stride+b;
@@ -306,8 +297,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block [dimension];
   int stride = rhs._grid->_slice_stride[dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
+
+  parallel_for_nest2(int n=0;n<e1;n++){
   for(int b=0;b<e2;b++){
 
       int o  =n*stride;

@@ -39,8 +39,7 @@ namespace Grid {
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
@@ -120,8 +115,7 @@ PARALLEL_FOR_LOOP
     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
@@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs);
@@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs);
@@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs,&rhs._odata[ss]);
@@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs,&rhs._odata[ss]);
@@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
     void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs,&rhs._odata[ss]);
@@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
     void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs,&rhs._odata[ss]);
@@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
       vstream(ret._odata[ss],tmp);
@@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];
       vstream(ret._odata[ss],tmp);

@@ -121,8 +121,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;
 
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;
 
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;
 
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       //vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,eval(ss,expr));
@@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;
 
     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;
 
     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -235,8 +230,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;
 
     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       vstream(_odata[ss] ,eval(ss,expr));
     }
   };
@@ -258,8 +252,7 @@ PARALLEL_FOR_LOOP
     	_grid = r._grid;
     	checkerboard = r.checkerboard;
     	_odata.resize(_grid->oSites());// essential
-  		PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+	parallel_for(int ss=0;ss<_grid->oSites();ss++){
             _odata[ss]=r._odata[ss];
         }  	
     }
@@ -269,8 +262,7 @@ PARALLEL_FOR_LOOP
     virtual ~Lattice(void) = default;
     
     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r;
        }
        return *this;
@@ -279,8 +271,7 @@ PARALLEL_FOR_LOOP
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
      
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
        }
        return *this;

@@ -45,90 +45,87 @@ namespace Grid {
   //////////////////////////////////////////////////////////////////////////
   template<class vfunctor,class lobj,class robj>  
     inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
-    {
-      Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
   }
   //////////////////////////////////////////////////////////////////////////
   // compare lattice to scalar
   //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+  template<class vfunctor,class lobj,class robj> 
     inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
-    {
-      Lattice<vInteger> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs);
+    }
+    return ret;
   }
   //////////////////////////////////////////////////////////////////////////
   // compare scalar to lattice
   //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+  template<class vfunctor,class lobj,class robj> 
     inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
-    {
-      Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs);
+    }
+
+    return ret;
   }
   
   //////////////////////////////////////////////////////////////////////////
   // Map to functors
   //////////////////////////////////////////////////////////////////////////
-    // Less than
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
-     return SLComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-
-   // Less than equal
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
-     return SLComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-
-   // Greater than 
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vgt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vgt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
+  // Less than
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Less than equal
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Greater than 
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
     return SLComparison(vgt<lobj,robj>(),lhs,rhs);
   }
-
-
-   // Greater than equal
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  
+  
+  // Greater than equal
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     return LLComparison(vge<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
@@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
     return LSComparison(vge<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
+    inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vge<lobj,robj>(),lhs,rhs);
   }
-
   
   // equal
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(veq<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
+    inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(veq<lobj,robj>(),lhs,rhs);
   }
  template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
+    inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(veq<lobj,robj>(),lhs,rhs);
  }
-
-
+  
+  
   // not equal
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vne<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
+    inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vne<lobj,robj>(),lhs,rhs);
   }
  template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
+    inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vne<lobj,robj>(),lhs,rhs);
  }
 
 }
 #endif

@@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid {
 
-    /////////////////////////////////////////////////////
-    // Non site, reduced locally reduced routines
-    /////////////////////////////////////////////////////
-
-    // localNorm2,
-    template<class vobj>
+  /////////////////////////////////////////////////////
+  // Non site, reduced locally reduced routines
+  /////////////////////////////////////////////////////
+  
+  // localNorm2,
+  template<class vobj>
     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
     {
       Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
     }
-    
-    // localInnerProduct
-    template<class vobj>
+  
+  // localInnerProduct
+  template<class vobj>
     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
     {
      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
      }
      return ret;
    }
-    
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
+  
+  // outerProduct Scalar x Scalar -> Scalar
+  //              Vector x Vector -> Matrix
+  template<class ll,class rr>
    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-     }
-
+  {
+    Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
 }
 
 #endif

@@ -37,8 +37,7 @@ namespace Grid {
   inline Lattice<vobj> operator -(const Lattice<vobj> &r)
   {
     Lattice<vobj> ret(r._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<r._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
       vstream(ret._odata[ss], -r._odata[ss]);
     }
     return ret;
@@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
   {
     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss]; 
       vstream(ret._odata[ss],tmp);
 	   //      ret._odata[ss]=lhs*rhs._odata[ss];
@@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
     inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
     {
       Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];  
 	vstream(ret._odata[ss],tmp);
 	//	ret._odata[ss]=lhs+rhs._odata[ss];
@@ -98,11 +95,9 @@ PARALLEL_FOR_LOOP
     inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
   {
     Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];  
       vstream(ret._odata[ss],tmp);
       //      ret._odata[ss]=lhs-rhs._odata[ss];
     }
     return ret;
   }
@@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
       inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
     {
       Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
 	vstream(ret._odata[ss],tmp);
 	//            ret._odata[ss]=lhs._odata[ss]*rhs;
@@ -122,8 +116,7 @@ PARALLEL_FOR_LOOP
       inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
     {
         Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; 
 	  vstream(ret._odata[ss],tmp);
 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs;
@@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
       inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
     {
       Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
 	  vstream(ret._odata[ss],tmp);
 	  //	ret._odata[ss]=lhs._odata[ss]-rhs;
       }
       return ret;
     }
-
-
 }
 #endif

@@ -44,22 +44,20 @@ namespace Grid {
     {
       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
       ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+      }
+      return ret;
     };
     template<int Index,class vobj>
-       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
     {
       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
       ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+      }
+      return ret;
     };
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
     template<int Index,class vobj> 
     void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
     {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+      }      
     }
     template<int Index,class vobj>
       void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
     {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+      }      
     }
 
     //////////////////////////////////////////////////////
@@ -131,9 +127,6 @@ PARALLEL_FOR_LOOP
 
       assert( l.checkerboard == l._grid->CheckerBoard(site));
 
-      // FIXME
-      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
-
       int rank,odx,idx;
       grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 

@@ -40,8 +40,7 @@ namespace Grid {
 
     template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
         Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
             ret._odata[ss] = adj(lhs._odata[ss]);
         }
         return ret;
@@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP
 
     template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
         Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = conjugate(lhs._odata[ss]);
+	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  ret._odata[ss] = conjugate(lhs._odata[ss]);
         }
         return ret;
     };
-
-
 }
 #endif

@@ -57,8 +57,7 @@ namespace Grid {
 	sumarray[i]=zero;
       }
 
-PARALLEL_FOR_LOOP
-      for(int thr=0;thr<grid->SumArraySize();thr++){
+      parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 	
@@ -68,7 +67,7 @@ PARALLEL_FOR_LOOP
 	}
 	sumarray[thr]=TensorRemove(vnrm) ;
       }
-    
+      
       vector_type vvnrm; vvnrm=zero;  // sum across threads
       for(int i=0;i<grid->SumArraySize();i++){
 	vvnrm = vvnrm+sumarray[i];
@@ -114,18 +113,17 @@ PARALLEL_FOR_LOOP
 	sumarray[i]=zero;
       }
 
-PARALLEL_FOR_LOOP
-      for(int thr=0;thr<grid->SumArraySize();thr++){
+      parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 
 	
 	vobj vvsum=zero;
         for(int ss=myoff;ss<mywork+myoff; ss++){
 	  vvsum = vvsum + arg._odata[ss];
 	}
 	sumarray[thr]=vvsum;
       }
 
       
       vobj vsum=zero;  // sum across threads
       for(int i=0;i<grid->SumArraySize();i++){
 	vsum = vsum+sumarray[i];

@@ -302,8 +302,7 @@ namespace Grid {
       int words=sizeof(scalar_object)/sizeof(scalar_type);
 
 
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<osites;ss++){
+      parallel_for(int ss=0;ss<osites;ss++){
 
 	std::vector<scalar_object> buf(Nsimd);
 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times

@@ -42,8 +42,7 @@ namespace Grid {
       -> Lattice<decltype(trace(lhs._odata[0]))>
     {
       Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
             ret._odata[ss] = trace(lhs._odata[ss]);
         }
         return ret;
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
     inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
     {
       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<lhs._grid->oSites();ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
       }
       return ret;

@@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
    //PARALLEL_FOR_LOOP
    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
    //PARALLEL_FOR_LOOP
    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
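These two loops are the one place the rename deliberately leaves the parallel macro commented out: ssh is a compaction index into the half-checkerboard field that advances only on sites of the requested parity, so the iterations are not independent and a naive parallel_for would race on it. A minimal sketch of that dependency (the container names and parity array are illustrative, not Grid's types):

#include <vector>

// cb-parity compaction, mirroring pickCheckerboard's loop shape.
void pick_cb(int cb, std::vector<double> &half, const std::vector<double> &full,
             const std::vector<int> &parity) {
  int ssh = 0;                          // running write index into `half`
  for (size_t ss = 0; ss < full.size(); ss++) {
    if (parity[ss] == cb) {             // only sites of the requested parity
      half[ssh] = full[ss];             // write slot depends on all earlier
      ssh++;                            // iterations, so this cannot be a
    }                                   // naive parallel_for
  }
}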
@@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
  }

PARALLEL_FOR_LOOP
  for(int sf=0;sf<fine->oSites();sf++){
  parallel_for(int sf=0;sf<fine->oSites();sf++){
    
    int sc;
    std::vector<int> coor_c(_ndimension);
@@ -186,8 +185,7 @@ template<class vobj,class CComplex>

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<coarse->oSites();ss++){
  parallel_for(int ss=0;ss<coarse->oSites();ss++){
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
  }
}
@@ -347,8 +345,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->lSites() == og->lSites());
  }

  PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
  parallel_for(int idx=0;idx<ig->lSites();idx++){
    sobj s;
    ssobj ss;

@@ -386,8 +383,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }

  // the above should guarantee that the operations are local
  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -428,8 +424,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
    }
  }
  // the above should guarantee that the operations are local
  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -468,8 +463,7 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
  }

  // the above should guarantee that the operations are local
  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -504,8 +498,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
  }

  // the above should guarantee that the operations are local
  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
  }
  
PARALLEL_FOR_LOOP
  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> out_ptrs(in_nsimd);

@@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
    
  PARALLEL_FOR_LOOP
  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
  parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);

@@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
    merge(out._odata[out_oidx], ptrs, 0);
  }
}


  

 
}
#endif
 
@@ -40,27 +40,24 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj>
    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = transpose(lhs._odata[ss]);
        }
        return ret;
    };
    Lattice<vobj> ret(lhs._grid);
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = transpose(lhs._odata[ss]);
    }
    return ret;
  };
    
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // Index level dependent transpose
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Index level dependent transpose
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<int Index,class vobj>
    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
        }
        return ret;
    };

  {
    Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
    }
    return ret;
  };
}
#endif
 
@@ -37,8 +37,7 @@ namespace Grid {
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=pow(rhs._odata[ss],y);
    }
    return ret;
@@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=mod(rhs._odata[ss],y);
    }
    return ret;
@@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=div(rhs._odata[ss],y);
    }
    return ret;
@@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
    }
    return ret;
 
@@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  std::vector<scalar_object> truevals (Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);

PARALLEL_FOR_LOOP
  for(int ss=0;ss<iftrue._grid->oSites(); ss++){
  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){

    extract(iftrue._odata[ss]   ,truevals);
    extract(iffalse._odata[ss]  ,falsevals);
 
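One caveat worth flagging in this hunk: truevals and falsevals are allocated once, outside the loop, so with the loop now parallel every thread shares the same scratch vectors and the concurrent extract calls can race. A hedged sketch of the thread-safe shape (illustrative only, not part of this commit) simply moves the buffers inside the loop body:

  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
    // thread-private scratch, one pair per iteration, instead of shared buffers
    std::vector<scalar_object> truevals (Nsimd);
    std::vector<scalar_object> falsevals(Nsimd);
    extract(iftrue._odata[ss]  ,truevals);
    extract(iffalse._odata[ss] ,falsevals);
    // ... per-lane select on the predicate, then merge into ret ...
  }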
@@ -54,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls

  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
      auto tmp = psi._odata[0];
      if ( s==0 ) {
@@ -98,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls

  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
@@ -137,8 +137,7 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
  MooeeInvCalls++;
  MooeeInvTime-=usecond();

PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];

    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
@@ -184,8 +183,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  MooeeInvCalls++;
  MooeeInvTime-=usecond();

PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls

    auto tmp = psi._odata[0];

 
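All of the CayleyFermion5D loops above stride by Ls rather than 1: oSites() here counts five-dimensional sites with the fifth coordinate s running fastest, so striding by Ls hands each thread a complete s-column of one 4d site and the recursions in s stay thread-local. A sketch of that traversal (field and the slice coupling are illustrative stand-ins, not the Cayley kernel):

#include <vector>

// Ls-blocked traversal mirroring the ss+=Ls loops above; assumes osites
// counts 5d sites with s fastest, so osites is a multiple of Ls.
void sweep_fifth_dim(std::vector<double> &field, int osites, int Ls) {
  for (int ss = 0; ss < osites; ss += Ls) {  // this outer loop is the parallel one
    for (int s = 0; s < Ls; s++) {           // serial walk up the s-column
      int idx = ss + s;                      // contiguous 5d index inside the block
      int sp  = (s + 1) % Ls;                // neighbouring slice in s
      field[idx] += field[ss + sp];          // never touches another thread's sites
    }
  }
}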
@@ -91,8 +91,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,

  assert(Nc==3);

PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
      alignas(64) SiteHalfSpinor hp;
      alignas(64) SiteHalfSpinor hm;
@@ -232,8 +231,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,

  M5Dcalls++;
  M5Dtime-=usecond();
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
  parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
@@ -792,13 +790,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  MooeeInvTime-=usecond();

  if ( switcheroo<Coeff_t>::iscomplex() ) {
  PARALLEL_FOR_LOOP
    for(auto site=0;site<vol;site++){
    parallel_for(auto site=0;site<vol;site++){
      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
  } else { 
  PARALLEL_FOR_LOOP
    for(auto site=0;site<vol;site++){
    parallel_for(auto site=0;site<vol;site++){
      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
  }
 
@@ -194,8 +194,7 @@ namespace QCD {
      GaugeLinkField tmp(mat._grid);
      tmp = zero;
      
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<tmp._grid->oSites();sss++){
      parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){
	int sU=sss;
	for(int s=0;s<Ls;s++){
	  int sF = s+Ls*sU;
@@ -445,8 +444,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
       Uconj = where(coor==neglink,-Uconj,Uconj);
     }
	  
PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
     parallel_for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu) = U[ss]();
       Uds[ss](1)(mu) = Uconj[ss]();
     }
@@ -459,8 +457,7 @@ PARALLEL_FOR_LOOP
       Utmp = where(coor==0,Uconj,Utmp);
     }
	  
PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
     parallel_for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu+4) = Utmp[ss]();
     }
	  
@@ -469,8 +466,7 @@ PARALLEL_FOR_LOOP
       Utmp = where(coor==0,U,Utmp);
     }
	  
PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
     parallel_for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](1)(mu+4) = Utmp[ss]();
     }
	  
@@ -484,8 +480,7 @@ PARALLEL_FOR_LOOP
   GaugeLinkField link(mat._grid);
   // use lorentz for flavour as hack.
   auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP
   for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
   parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
     link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
   }
   PokeIndex<LorentzIndex>(mat, link, mu);
@@ -498,8 +493,7 @@ PARALLEL_FOR_LOOP
	
   GaugeLinkField tmp(mat._grid);
   tmp = zero;
PARALLEL_FOR_LOOP
   for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
   parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) {
     for (int s = 0; s < Ls; s++) {
       int sF = s + Ls * ss;
       auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
 
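Note the iterator-style call sites in the Gparity hunks (for(auto ss=U.begin();ss<U.end();ss++)): because parallel_for is just PARALLEL_FOR_LOOP glued to a plain for (the test-file hunks at the end of this commit show the macro), any canonical loop header works, not only int counters. A plain OpenMP sketch of the same shape (copy_sites is illustrative; in Grid, Lattice begin()/end() appear to yield integer site indices, which keeps the loop in OpenMP's canonical form):

#include <vector>

void copy_sites(std::vector<double> &dst, const std::vector<double> &src) {
#pragma omp parallel for
  for (long ss = 0; ss < (long)src.size(); ss++) {
    dst[ss] = src[ss];   // independent per-site writes: safe to parallelise
  }
}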
@@ -222,8 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    ////////////////////////
    // Call the single hop
    ////////////////////////
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < B._grid->oSites(); sss++) {
    parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
      Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
                               gamma);
    }
@@ -333,8 +332,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,

  Stencil.HaloExchange(in, compressor);

  PARALLEL_FOR_LOOP
  for (int sss = 0; sss < in._grid->oSites(); sss++) {
  parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
    Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
  }
};
@@ -350,13 +348,11 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  st.HaloExchange(in, compressor);

  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
      Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
    }
  } else {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
      Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
    }
  }
 
@@ -275,8 +275,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  assert(dirdisp<=7);
  assert(dirdisp>=0);

PARALLEL_FOR_LOOP
  for(int ss=0;ss<Umu._grid->oSites();ss++){
  parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
@@ -323,8 +322,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////

    DerivDhopComputeTime -= usecond();
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < U._grid->oSites(); sss++) {
    parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
      for (int s = 0; s < Ls; s++) {
        int sU = sss;
        int sF = s + Ls * sU;
@@ -493,73 +491,18 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion

  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
  } else {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
  }
  /*

  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
#ifdef AVX512_SWITCHOFF
  } else if (stat.is_init() ) {

    int nthreads;
    stat.start();
#pragma omp parallel
    {
#pragma omp master
    nthreads = omp_get_num_threads();
    int mythread = omp_get_thread_num();
    stat.enter(mythread);
#pragma omp for nowait
    for(int ss=0;ss<U._grid->oSites();ss++) {
      int sU=ss;
      int sF=LLs*sU;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
    stat.exit(mythread);
    }
    stat.accum(nthreads);
#endif
  } else {
#if 1
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
#else
#ifdef GRID_OMP
#pragma omp parallel 
#endif
    {
      int len = U._grid->oSites();
      int me, myoff,mywork;
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
      int sF = LLs * myoff;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
    }
#endif
  }
  */

  DhopComputeTime+=usecond();
}

 
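The block deleted from DhopInternalSerialComms had preserved, as a comment, two older threading strategies: an explicit OpenMP region with per-thread statistics hooks, and a GetWorkBarrier variant that hands each thread one contiguous block of sites so the kernel is entered once per thread instead of once per site. A sketch of that block-of-work scheme (process_block is a hypothetical stand-in for Kernels::DhopSite):

#include <omp.h>

void process_block(int off, int work);   // hypothetical per-block kernel

void dhop_blocked(int nsites) {
#pragma omp parallel
  {
    int nthr   = omp_get_num_threads();
    int me     = omp_get_thread_num();
    int mywork = (nsites + nthr - 1) / nthr;               // even split, GetWork-style
    int myoff  = me * mywork;                              // this thread's first site
    if (myoff + mywork > nsites) mywork = nsites - myoff;  // clamp the last block
    if (mywork > 0) process_block(myoff, mywork);          // one kernel call per thread
  }
}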
@@ -66,8 +66,7 @@ public:
  // Move this elsewhere? FIXME
  static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
                                  int mu) { // U[mu] += W
    PARALLEL_FOR_LOOP
    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
    parallel_for (auto ss = 0; ss < U._grid->oSites(); ss++) {
      U._odata[ss]._internal[mu] =
          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
    }
 
@@ -48,8 +48,7 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
  GridBase *grid=x._grid;

  Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss++){
  parallel_for(int ss=0;ss<grid->oSites();ss++){
    vobj tmp;
    tmp = a*x._odata[ss];
    tmp = tmp + G5*(b*timesI(x._odata[ss]));
@@ -65,8 +64,7 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
    vstream(z._odata[ss+s],tmp);
  }
@@ -81,8 +79,7 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
  Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    tmp = G5*x._odata[ss+s]*a;
    tmp = tmp + b*y._odata[ss+sp];
@@ -99,8 +96,7 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
  Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    tmp = G5*y._odata[ss+sp]*b;
    tmp = tmp + a*x._odata[ss+s];
@@ -117,8 +113,7 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
  Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp1;
    vobj tmp2;
    tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
@@ -135,8 +130,7 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    spProj5m(tmp,y._odata[ss+sp]);
    tmp = a*x._odata[ss+s]+b*tmp;
@@ -152,8 +146,7 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
  conformable(x,z);
  GridBase *grid=x._grid;
  int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    spProj5p(tmp,y._odata[ss+sp]);
    tmp = a*x._odata[ss+s]+b*tmp;
@@ -169,8 +162,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
  Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    vobj tmp;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
 
@@ -221,8 +221,7 @@ class SU {
    int i0, i1;
    su2SubGroupIndex(i0, i1, su2_index);

    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < grid->oSites(); ss++) {
    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
      subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
      subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
      subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
@@ -252,8 +251,7 @@ class SU {
    su2SubGroupIndex(i0, i1, su2_index);

    dest = 1.0;  // start out with identity
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < grid->oSites(); ss++) {
    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
      dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
      dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
      dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);
 
@@ -31,8 +31,6 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);
 
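Each benchmark and test used to carry this local definition; the remaining hunks of the commit delete it now that parallel_for is provided centrally. How the two macros compose, assuming an OpenMP build (the PARALLEL_FOR_LOOP definition itself is not part of this diff):

// Assumed expansion on an OpenMP build:
//   #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
//   #define parallel_for      PARALLEL_FOR_LOOP for
//
// so a call site such as
//   parallel_for(int ss = 0; ss < N; ss++) { work(ss); }
// preprocesses to
//   _Pragma("omp parallel for") for (int ss = 0; ss < N; ss++) { work(ss); }
// and degrades gracefully to a plain serial `for` when the pragma is empty.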
@@ -31,8 +31,6 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 
@@ -31,7 +31,7 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;

#define parallel_for PARALLEL_FOR_LOOP for
 

int main (int argc, char ** argv)
{
 