mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	GparityWilsonTM typedef added. Not yet tested
Conflicts: configure lib/qcd/action/fermion/WilsonKernels.h
This commit is contained in:
		@@ -117,6 +117,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  typename DomainWallFermionR::ImplParams params; 
 | 
					  typename DomainWallFermionR::ImplParams params; 
 | 
				
			||||||
  params.overlapCommsCompute = overlapComms;
 | 
					  params.overlapCommsCompute = overlapComms;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -136,6 +137,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
    err = ref-result; 
 | 
					    err = ref-result; 
 | 
				
			||||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
    Dw.Report();
 | 
					    Dw.Report();
 | 
				
			||||||
@@ -193,6 +195,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    double flops=(1344.0*volume*ncall)/2;
 | 
					    double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
					  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -137,9 +137,6 @@
 | 
				
			|||||||
/* Define to the one symbol short name of this package. */
 | 
					/* Define to the one symbol short name of this package. */
 | 
				
			||||||
#undef PACKAGE_TARNAME
 | 
					#undef PACKAGE_TARNAME
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Define to the home page for this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_URL
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the version of this package. */
 | 
					/* Define to the version of this package. */
 | 
				
			||||||
#undef PACKAGE_VERSION
 | 
					#undef PACKAGE_VERSION
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										316
									
								
								lib/Stencil.h
									
									
									
									
									
								
							
							
						
						
									
										316
									
								
								lib/Stencil.h
									
									
									
									
									
								
							@@ -7,8 +7,6 @@
 | 
				
			|||||||
    Copyright (C) 2015
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
					 | 
				
			||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    This program is free software; you can redistribute it and/or modify
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
    it under the terms of the GNU General Public License as published by
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
@@ -88,11 +86,95 @@ namespace Grid {
 | 
				
			|||||||
      typedef typename cobj::scalar_type scalar_type;
 | 
					      typedef typename cobj::scalar_type scalar_type;
 | 
				
			||||||
      typedef typename cobj::scalar_object scalar_object;
 | 
					      typedef typename cobj::scalar_object scalar_object;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      //////////////////////////////////////////
 | 
				
			||||||
 | 
					      // Comms packet queue for asynch thread
 | 
				
			||||||
 | 
					      //////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      struct Packet {
 | 
				
			||||||
 | 
						void * send_buf;
 | 
				
			||||||
 | 
						void * recv_buf;
 | 
				
			||||||
 | 
						Integer to_rank;
 | 
				
			||||||
 | 
						Integer from_rank;
 | 
				
			||||||
 | 
						Integer bytes;
 | 
				
			||||||
 | 
						volatile Integer done;
 | 
				
			||||||
 | 
					      };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      std::vector<Packet> Packets;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
 | 
				
			||||||
 | 
						Packet p;
 | 
				
			||||||
 | 
						p.send_buf = xmit;
 | 
				
			||||||
 | 
						p.recv_buf = rcv;
 | 
				
			||||||
 | 
						p.to_rank  = to;
 | 
				
			||||||
 | 
						p.from_rank= from;
 | 
				
			||||||
 | 
						p.bytes    = bytes;
 | 
				
			||||||
 | 
						p.done     = 0;
 | 
				
			||||||
 | 
						comms_bytes+=2.0*bytes;
 | 
				
			||||||
 | 
						Packets.push_back(p);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void Communicate(void ) { 
 | 
				
			||||||
 | 
						commtime-=usecond();
 | 
				
			||||||
 | 
						for(int i=0;i<Packets.size();i++){
 | 
				
			||||||
 | 
						  _grid->SendToRecvFrom(Packets[i].send_buf,
 | 
				
			||||||
 | 
									Packets[i].to_rank,
 | 
				
			||||||
 | 
									Packets[i].recv_buf,
 | 
				
			||||||
 | 
									Packets[i].from_rank,
 | 
				
			||||||
 | 
									Packets[i].bytes);
 | 
				
			||||||
 | 
						  Packets[i].done = 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						commtime+=usecond();
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      ///////////////////////////////////////////
 | 
				
			||||||
 | 
					      // Simd merge queue for asynch comms
 | 
				
			||||||
 | 
					      ///////////////////////////////////////////
 | 
				
			||||||
 | 
					      struct Merge {
 | 
				
			||||||
 | 
					        cobj * mpointer;
 | 
				
			||||||
 | 
						std::vector<scalar_object *> rpointers;
 | 
				
			||||||
 | 
						Integer buffer_size;
 | 
				
			||||||
 | 
						Integer packet_id;
 | 
				
			||||||
 | 
					      };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      std::vector<Merge> Mergers;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
 | 
				
			||||||
 | 
						Merge m;
 | 
				
			||||||
 | 
						m.mpointer = merge_p;
 | 
				
			||||||
 | 
						m.rpointers= rpointers;
 | 
				
			||||||
 | 
						m.buffer_size = buffer_size;
 | 
				
			||||||
 | 
						m.packet_id   = packet_id;
 | 
				
			||||||
 | 
						Mergers.push_back(m);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void CommsMerge(void ) { 
 | 
				
			||||||
 | 
						//PARALLEL_NESTED_LOOP2 
 | 
				
			||||||
 | 
						for(int i=0;i<Mergers.size();i++){	
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						spintime-=usecond();
 | 
				
			||||||
 | 
						int packet_id = Mergers[i].packet_id;
 | 
				
			||||||
 | 
						while(! Packets[packet_id].done ); // spin for completion
 | 
				
			||||||
 | 
						spintime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						mergetime-=usecond();
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
						  for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
				
			||||||
 | 
						    merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						mergetime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      ////////////////////////////////////////
 | 
				
			||||||
 | 
					      // Basic Grid and stencil info
 | 
				
			||||||
 | 
					      ////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int                               _checkerboard;
 | 
					      int                               _checkerboard;
 | 
				
			||||||
      int                               _npoints; // Move to template param?
 | 
					      int                               _npoints; // Move to template param?
 | 
				
			||||||
      GridBase *                        _grid;
 | 
					      GridBase *                        _grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
      // npoints of these
 | 
					      // npoints of these
 | 
				
			||||||
      std::vector<int>                  _directions;
 | 
					      std::vector<int>                  _directions;
 | 
				
			||||||
      std::vector<int>                  _distances;
 | 
					      std::vector<int>                  _distances;
 | 
				
			||||||
@@ -101,32 +183,32 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      // npoints x Osites() of these
 | 
					      // npoints x Osites() of these
 | 
				
			||||||
      std::vector<std::vector<StencilEntry> > _entries;
 | 
					      std::vector<std::vector<StencilEntry> > _entries;
 | 
				
			||||||
 | 
					 | 
				
			||||||
      // Comms buffers
 | 
					 | 
				
			||||||
      std::vector<std::vector<scalar_object> > send_buf_extract;
 | 
					 | 
				
			||||||
      std::vector<std::vector<scalar_object> > recv_buf_extract;
 | 
					 | 
				
			||||||
      std::vector<scalar_object *> pointers;
 | 
					 | 
				
			||||||
      std::vector<scalar_object *> rpointers;
 | 
					 | 
				
			||||||
      Vector<cobj> send_buf;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
 | 
					      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Comms buffers
 | 
				
			||||||
 | 
					      std::vector<Vector<scalar_object> > u_simd_send_buf;
 | 
				
			||||||
 | 
					      std::vector<Vector<scalar_object> > u_simd_recv_buf;
 | 
				
			||||||
 | 
					      Vector<cobj>          u_send_buf;
 | 
				
			||||||
 | 
					      Vector<cobj>          comm_buf;
 | 
				
			||||||
 | 
					      int u_comm_offset;
 | 
				
			||||||
      int _unified_buffer_size;
 | 
					      int _unified_buffer_size;
 | 
				
			||||||
      int _request_count;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      double buftime;
 | 
					      /////////////////////////////////////////
 | 
				
			||||||
 | 
					      // Timing info; ugly; possibly temporary
 | 
				
			||||||
 | 
					      /////////////////////////////////////////
 | 
				
			||||||
 | 
					#define TIMING_HACK
 | 
				
			||||||
 | 
					#ifdef TIMING_HACK
 | 
				
			||||||
 | 
					      double jointime;
 | 
				
			||||||
      double gathertime;
 | 
					      double gathertime;
 | 
				
			||||||
      double commtime;
 | 
					      double commtime;
 | 
				
			||||||
      double commstime;
 | 
					      double halogtime;
 | 
				
			||||||
      double halotime;
 | 
					 | 
				
			||||||
      double scattertime;
 | 
					 | 
				
			||||||
      double mergetime;
 | 
					      double mergetime;
 | 
				
			||||||
 | 
					      double spintime;
 | 
				
			||||||
 | 
					      double comms_bytes;
 | 
				
			||||||
      double gathermtime;
 | 
					      double gathermtime;
 | 
				
			||||||
      double splicetime;
 | 
					      double splicetime;
 | 
				
			||||||
      double nosplicetime;
 | 
					      double nosplicetime;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  CartesianStencil(GridBase *grid,
 | 
					  CartesianStencil(GridBase *grid,
 | 
				
			||||||
				     int npoints,
 | 
									     int npoints,
 | 
				
			||||||
@@ -135,28 +217,29 @@ namespace Grid {
 | 
				
			|||||||
				     const std::vector<int> &distances) 
 | 
									     const std::vector<int> &distances) 
 | 
				
			||||||
    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
 | 
					    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
 | 
					#ifdef TIMING_HACK
 | 
				
			||||||
      gathertime=0;
 | 
					      gathertime=0;
 | 
				
			||||||
 | 
					      jointime=0;
 | 
				
			||||||
      commtime=0;
 | 
					      commtime=0;
 | 
				
			||||||
      commstime=0;
 | 
					      halogtime=0;
 | 
				
			||||||
      halotime=0;
 | 
					 | 
				
			||||||
      scattertime=0;
 | 
					 | 
				
			||||||
      mergetime=0;
 | 
					      mergetime=0;
 | 
				
			||||||
 | 
					      spintime=0;
 | 
				
			||||||
      gathermtime=0;
 | 
					      gathermtime=0;
 | 
				
			||||||
      buftime=0;
 | 
					 | 
				
			||||||
      splicetime=0;
 | 
					      splicetime=0;
 | 
				
			||||||
      nosplicetime=0;
 | 
					      nosplicetime=0;
 | 
				
			||||||
 | 
					      comms_bytes=0;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
      _npoints = npoints;
 | 
					      _npoints = npoints;
 | 
				
			||||||
      _grid    = grid;
 | 
					      _grid    = grid;
 | 
				
			||||||
      _directions = directions;
 | 
					      _directions = directions;
 | 
				
			||||||
      _distances  = distances;
 | 
					      _distances  = distances;
 | 
				
			||||||
      _unified_buffer_size=0;
 | 
					      _unified_buffer_size=0;
 | 
				
			||||||
      _request_count =0;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int osites  = _grid->oSites();
 | 
					      int osites  = _grid->oSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int i=0;i<npoints;i++){
 | 
					      for(int ii=0;ii<npoints;ii++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int i = ii; // reverse direction to get SIMD comms done first
 | 
				
			||||||
	int point = i;
 | 
						int point = i;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_entries[i].resize( osites);
 | 
						_entries[i].resize( osites);
 | 
				
			||||||
@@ -197,21 +280,24 @@ namespace Grid {
 | 
				
			|||||||
	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
						  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if ( sshift[0] == sshift[1] ) {
 | 
						  if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
	    //	    std::cout<<"Comms 0x3"<<std::endl;
 | 
					 | 
				
			||||||
	    Comms(point,dimension,shift,0x3);
 | 
						    Comms(point,dimension,shift,0x3);
 | 
				
			||||||
	  } else {
 | 
						  } else {
 | 
				
			||||||
	    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 | 
					 | 
				
			||||||
	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
						    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
						    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	//	for(int ss=0;ss<osites;ss++){
 | 
					 | 
				
			||||||
	//	  std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
 | 
					 | 
				
			||||||
	//	    _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
 | 
					 | 
				
			||||||
	//	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					      u_send_buf.resize(_unified_buffer_size);
 | 
				
			||||||
 | 
					      comm_buf.resize(_unified_buffer_size);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
 | 
					      const int Nsimd = grid->Nsimd();
 | 
				
			||||||
 | 
					      u_simd_send_buf.resize(Nsimd);
 | 
				
			||||||
 | 
					      u_simd_recv_buf.resize(Nsimd);
 | 
				
			||||||
 | 
					      for(int l=0;l<Nsimd;l++){
 | 
				
			||||||
 | 
						u_simd_send_buf[l].resize(_unified_buffer_size);
 | 
				
			||||||
 | 
						u_simd_recv_buf[l].resize(_unified_buffer_size);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void Local     (int point, int dimension,int shiftpm,int cbmask)
 | 
					    void Local     (int point, int dimension,int shiftpm,int cbmask)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@@ -276,17 +362,15 @@ namespace Grid {
 | 
				
			|||||||
      assert(shift<fd);
 | 
					      assert(shift<fd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
 | 
					      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
 | 
				
			||||||
      //      std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
 | 
					
 | 
				
			||||||
      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 | 
					      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 | 
				
			||||||
                                           // send to one or more remote nodes.
 | 
					                                           // send to one or more remote nodes.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int cb= (cbmask==0x2)? Odd : Even;
 | 
					      int cb= (cbmask==0x2)? Odd : Even;
 | 
				
			||||||
      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
					      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
      for(int x=0;x<rd;x++){       
 | 
					      for(int x=0;x<rd;x++){       
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
	int permute_type=grid->PermuteType(dimension);
 | 
						int permute_type=grid->PermuteType(dimension);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	int sx        =  (x+sshift)%rd;
 | 
						int sx        =  (x+sshift)%rd;
 | 
				
			||||||
@@ -310,16 +394,9 @@ namespace Grid {
 | 
				
			|||||||
	} else { 
 | 
						} else { 
 | 
				
			||||||
	  int comm_proc = ((x+sshift)/rd)%pd;
 | 
						  int comm_proc = ((x+sshift)/rd)%pd;
 | 
				
			||||||
	  offnode = (comm_proc!= 0);
 | 
						  offnode = (comm_proc!= 0);
 | 
				
			||||||
	  //	  std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
 | 
					 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
 | 
					 | 
				
			||||||
	// x+sshift = 4
 | 
					 | 
				
			||||||
	// x+sshift/2 = 2
 | 
					 | 
				
			||||||
	// 2%2 == 0
 | 
					 | 
				
			||||||
	// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int wraparound=0;
 | 
						int wraparound=0;
 | 
				
			||||||
	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 | 
						if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 | 
				
			||||||
	  wraparound = 1;
 | 
						  wraparound = 1;
 | 
				
			||||||
@@ -337,15 +414,13 @@ namespace Grid {
 | 
				
			|||||||
	  int words = buffer_size;
 | 
						  int words = buffer_size;
 | 
				
			||||||
	  if (cbmask != 0x3) words=words>>1;
 | 
						  if (cbmask != 0x3) words=words>>1;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  //	  GatherPlaneSimple (point,dimension,sx,cbmask);
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int rank           = grid->_processor;
 | 
						  int rank           = grid->_processor;
 | 
				
			||||||
	  int recv_from_rank;
 | 
						  int recv_from_rank;
 | 
				
			||||||
	  int xmit_to_rank;
 | 
						  int xmit_to_rank;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  int unified_buffer_offset = _unified_buffer_size;
 | 
						  int unified_buffer_offset = _unified_buffer_size;
 | 
				
			||||||
	  _unified_buffer_size    += words;
 | 
						  _unified_buffer_size    += words;
 | 
				
			||||||
	  //	  std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
 | 
					
 | 
				
			||||||
	  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
 | 
						  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
@@ -441,39 +516,36 @@ namespace Grid {
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//      CartesianStencil(GridBase *grid,
 | 
					 | 
				
			||||||
//		       int npoints,
 | 
					 | 
				
			||||||
//		       int checkerboard,
 | 
					 | 
				
			||||||
//		       const std::vector<int> &directions,
 | 
					 | 
				
			||||||
//		       const std::vector<int> &distances);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
 | 
				
			||||||
 | 
						Mergers.resize(0); 
 | 
				
			||||||
 | 
						Packets.resize(0);
 | 
				
			||||||
 | 
						HaloGather(source,compress);
 | 
				
			||||||
 | 
					        return std::thread([&] { this->Communicate(); });
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Add to tables for various cases;  is this mistaken. only local if 1 proc in dim
 | 
					      void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
 | 
				
			||||||
      // Can this be avoided with simpler coding of comms?
 | 
					 | 
				
			||||||
   //      void Local     (int point, int dimension,int shift,int cbmask);
 | 
					 | 
				
			||||||
   //      void Comms     (int point, int dimension,int shift,int cbmask);
 | 
					 | 
				
			||||||
   //      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
 | 
					 | 
				
			||||||
   //      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      // Could allow a functional munging of the halo to another type during the comms.
 | 
					 | 
				
			||||||
      // this could implement the 16bit/32bit/64bit compression.
 | 
					 | 
				
			||||||
      void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress) 
 | 
					 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
	std::thread thr = HaloExchangeBegin(source,u_comm_buf,compress);
 | 
						auto thr = HaloExchangeBegin(source,compress);
 | 
				
			||||||
 | 
					        HaloExchangeComplete(thr);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void HaloExchangeComplete(std::thread &thr) 
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
						CommsMerge(); // spins
 | 
				
			||||||
 | 
						jointime-=usecond();
 | 
				
			||||||
	thr.join();
 | 
						thr.join();
 | 
				
			||||||
 | 
						jointime+=usecond();
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::thread HaloExchangeBegin(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > & u_comm_buf,compressor &compress) {
 | 
					      void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
				
			||||||
	return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); });
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      void HaloExchangeBlocking(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
 | 
					 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
	// conformable(source._grid,_grid);
 | 
						// conformable(source._grid,_grid);
 | 
				
			||||||
	assert(source._grid==_grid);
 | 
						assert(source._grid==_grid);
 | 
				
			||||||
	halotime-=usecond();
 | 
						halogtime-=usecond();
 | 
				
			||||||
	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
 | 
					
 | 
				
			||||||
	int u_comm_offset=0;
 | 
						assert (comm_buf.size() == _unified_buffer_size );
 | 
				
			||||||
 | 
						u_comm_offset=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Gather all comms buffers
 | 
						// Gather all comms buffers
 | 
				
			||||||
	for(int point = 0 ; point < _npoints; point++) {
 | 
						for(int point = 0 ; point < _npoints; point++) {
 | 
				
			||||||
@@ -506,35 +578,34 @@ namespace Grid {
 | 
				
			|||||||
	    if ( sshift[0] == sshift[1] ) {
 | 
						    if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
	      if (splice_dim) {
 | 
						      if (splice_dim) {
 | 
				
			||||||
		splicetime-=usecond();
 | 
							splicetime-=usecond();
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
							GatherSimd(source,dimension,shift,0x3,compress);
 | 
				
			||||||
		splicetime+=usecond();
 | 
							splicetime+=usecond();
 | 
				
			||||||
	      } else { 
 | 
						      } else { 
 | 
				
			||||||
		nosplicetime-=usecond();
 | 
							nosplicetime-=usecond();
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
							Gather(source,dimension,shift,0x3,compress);
 | 
				
			||||||
		nosplicetime+=usecond();
 | 
							nosplicetime+=usecond();
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
	    } else {
 | 
						    } else {
 | 
				
			||||||
	      //	      std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
					 | 
				
			||||||
	      if(splice_dim){
 | 
						      if(splice_dim){
 | 
				
			||||||
		splicetime-=usecond();
 | 
							splicetime-=usecond();
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
 | 
							GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
 | 
							GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
 | 
				
			||||||
		splicetime+=usecond();
 | 
							splicetime+=usecond();
 | 
				
			||||||
	      } else {
 | 
						      } else {
 | 
				
			||||||
		nosplicetime-=usecond();
 | 
							nosplicetime-=usecond();
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
 | 
							Gather(source,dimension,shift,0x1,compress);
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
 | 
							Gather(source,dimension,shift,0x2,compress);
 | 
				
			||||||
		nosplicetime+=usecond();
 | 
							nosplicetime+=usecond();
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	halotime+=usecond();
 | 
					
 | 
				
			||||||
 | 
						assert(u_comm_offset==_unified_buffer_size);
 | 
				
			||||||
 | 
						halogtime+=usecond();
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
					        void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
 | 
				
			||||||
			      std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
					 | 
				
			||||||
			      int &u_comm_offset,compressor & compress)
 | 
					 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
	  typedef typename cobj::vector_type vector_type;
 | 
						  typedef typename cobj::vector_type vector_type;
 | 
				
			||||||
	  typedef typename cobj::scalar_type scalar_type;
 | 
						  typedef typename cobj::scalar_type scalar_type;
 | 
				
			||||||
@@ -555,8 +626,6 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
						  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	  int cb= (cbmask==0x2)? Odd : Even;
 | 
						  int cb= (cbmask==0x2)? Odd : Even;
 | 
				
			||||||
	  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
						  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -573,7 +642,7 @@ namespace Grid {
 | 
				
			|||||||
	      int bytes = words * sizeof(cobj);
 | 
						      int bytes = words * sizeof(cobj);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      gathertime-=usecond();
 | 
						      gathertime-=usecond();
 | 
				
			||||||
	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
 | 
						      Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
 | 
				
			||||||
	      gathertime+=usecond();
 | 
						      gathertime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      int rank           = _grid->_processor;
 | 
						      int rank           = _grid->_processor;
 | 
				
			||||||
@@ -584,13 +653,18 @@ namespace Grid {
 | 
				
			|||||||
	      assert (recv_from_rank != _grid->ThisRank());
 | 
						      assert (recv_from_rank != _grid->ThisRank());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      //      FIXME Implement asynchronous send & also avoid buffer copy
 | 
						      //      FIXME Implement asynchronous send & also avoid buffer copy
 | 
				
			||||||
	      commtime-=usecond();
 | 
						      /*
 | 
				
			||||||
	      _grid->SendToRecvFrom((void *)&send_buf[0],
 | 
						      _grid->SendToRecvFrom((void *)&send_buf[0],
 | 
				
			||||||
				   xmit_to_rank,
 | 
									   xmit_to_rank,
 | 
				
			||||||
				    (void *)&u_comm_buf[u_comm_offset],
 | 
									    (void *)&comm_buf[u_comm_offset],
 | 
				
			||||||
 | 
									   recv_from_rank,
 | 
				
			||||||
 | 
									   bytes);
 | 
				
			||||||
 | 
						      */ 
 | 
				
			||||||
 | 
						      AddPacket((void *)&u_send_buf[u_comm_offset],
 | 
				
			||||||
 | 
								(void *)&comm_buf[u_comm_offset],
 | 
				
			||||||
 | 
								xmit_to_rank,
 | 
				
			||||||
			recv_from_rank,
 | 
								recv_from_rank,
 | 
				
			||||||
			bytes);
 | 
								bytes);
 | 
				
			||||||
	      commtime+=usecond();
 | 
					 | 
				
			||||||
			
 | 
								
 | 
				
			||||||
	      u_comm_offset+=words;
 | 
						      u_comm_offset+=words;
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
@@ -598,14 +672,10 @@ namespace Grid {
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	void  GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
						void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
 | 
				
			||||||
				   std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
					 | 
				
			||||||
				   int &u_comm_offset,compressor &compress)
 | 
					 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
	  buftime-=usecond();
 | 
					 | 
				
			||||||
	  const int Nsimd = _grid->Nsimd();
 | 
						  const int Nsimd = _grid->Nsimd();
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int fd = _grid->_fdimensions[dimension];
 | 
						  int fd = _grid->_fdimensions[dimension];
 | 
				
			||||||
	  int rd = _grid->_rdimensions[dimension];
 | 
						  int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
	  int ld = _grid->_ldimensions[dimension];
 | 
						  int ld = _grid->_ldimensions[dimension];
 | 
				
			||||||
@@ -628,21 +698,10 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 | 
						  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  //	Should grow to max size and then cost very little thereafter
 | 
					 | 
				
			||||||
	  send_buf_extract.resize(Nsimd);
 | 
					 | 
				
			||||||
	  recv_buf_extract.resize(Nsimd);
 | 
					 | 
				
			||||||
	  for(int l=0;l<Nsimd;l++){
 | 
					 | 
				
			||||||
	    if( send_buf_extract[l].size() < buffer_size) {
 | 
					 | 
				
			||||||
	      send_buf_extract[l].resize(buffer_size);
 | 
					 | 
				
			||||||
	      recv_buf_extract[l].resize(buffer_size);
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	  pointers.resize(Nsimd);
 | 
					 | 
				
			||||||
	  rpointers.resize(Nsimd);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	  int bytes = buffer_size*sizeof(scalar_object);
 | 
						  int bytes = buffer_size*sizeof(scalar_object);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  buftime+=usecond();
 | 
						  std::vector<scalar_object *> rpointers(Nsimd);
 | 
				
			||||||
 | 
						  std::vector<scalar_object *> spointers(Nsimd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  ///////////////////////////////////////////
 | 
						  ///////////////////////////////////////////
 | 
				
			||||||
	  // Work out what to send where
 | 
						  // Work out what to send where
 | 
				
			||||||
@@ -659,16 +718,19 @@ namespace Grid {
 | 
				
			|||||||
	    if ( any_offnode ) {
 | 
						    if ( any_offnode ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      for(int i=0;i<Nsimd;i++){       
 | 
						      for(int i=0;i<Nsimd;i++){       
 | 
				
			||||||
		pointers[i] = &send_buf_extract[i][0];
 | 
							spointers[i] = &u_simd_send_buf[i][u_comm_offset];
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      int sx   = (x+sshift)%rd;
 | 
						      int sx   = (x+sshift)%rd;
 | 
				
			||||||
	      
 | 
						      
 | 
				
			||||||
	      gathermtime-=usecond();
 | 
						      gathermtime-=usecond();
 | 
				
			||||||
	      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
 | 
						      Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
 | 
				
			||||||
	      gathermtime+=usecond();
 | 
						      gathermtime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      for(int i=0;i<Nsimd;i++){
 | 
						      for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							//		std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		int inner_bit = (Nsimd>>(permute_type+1));
 | 
							int inner_bit = (Nsimd>>(permute_type+1));
 | 
				
			||||||
		int ic= (i&inner_bit)? 1:0;
 | 
							int ic= (i&inner_bit)? 1:0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -680,45 +742,41 @@ namespace Grid {
 | 
				
			|||||||
		int nbr_ox   = (nbr_lcoor%rd);    // outer coord of peer
 | 
							int nbr_ox   = (nbr_lcoor%rd);    // outer coord of peer
 | 
				
			||||||
		int nbr_lane = (i&(~inner_bit));
 | 
							int nbr_lane = (i&(~inner_bit));
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
		int recv_from_rank;
 | 
					 | 
				
			||||||
		int xmit_to_rank;
 | 
					 | 
				
			||||||
		
 | 
					 | 
				
			||||||
		if (nbr_ic) nbr_lane|=inner_bit;
 | 
							if (nbr_ic) nbr_lane|=inner_bit;
 | 
				
			||||||
		assert (sx == nbr_ox);
 | 
							assert (sx == nbr_ox);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							auto rp = &u_simd_recv_buf[i       ][u_comm_offset];
 | 
				
			||||||
 | 
							auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							void *vrp = (void *)rp;
 | 
				
			||||||
 | 
							void *vsp = (void *)sp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if(nbr_proc){
 | 
							if(nbr_proc){
 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
 | 
							  int recv_from_rank;
 | 
				
			||||||
 | 
							  int xmit_to_rank;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
							  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
		  commstime-=usecond();
 | 
							  AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
 | 
				
			||||||
		  _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 | 
					 | 
				
			||||||
					xmit_to_rank,
 | 
					 | 
				
			||||||
					(void *)&recv_buf_extract[i][0],
 | 
					 | 
				
			||||||
					recv_from_rank,
 | 
					 | 
				
			||||||
					bytes);
 | 
					 | 
				
			||||||
		  commstime+=usecond();
 | 
					 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
		  rpointers[i] = &recv_buf_extract[i][0];
 | 
							  rpointers[i] = rp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		} else { 
 | 
							} else { 
 | 
				
			||||||
		  rpointers[i] = &send_buf_extract[nbr_lane][0];
 | 
					
 | 
				
			||||||
 | 
							  rpointers[i] = sp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      //	      std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size  <<" unified  buffer size "<<_unified_buffer_size<<std::endl;
 | 
						      AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
 | 
				
			||||||
	      mergetime-=usecond();
 | 
					
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
						      u_comm_offset     +=buffer_size;
 | 
				
			||||||
	      for(int i=0;i<buffer_size;i++){
 | 
					 | 
				
			||||||
		//		std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
 | 
					 | 
				
			||||||
		//		assert(u_comm_offset+i<_unified_buffer_size);
 | 
					 | 
				
			||||||
		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 | 
					 | 
				
			||||||
	      }
 | 
					 | 
				
			||||||
	      mergetime+=usecond();
 | 
					 | 
				
			||||||
	      u_comm_offset+=buffer_size;
 | 
					 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -204,7 +204,6 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    std::vector<CoarseMatrix> A;
 | 
					    std::vector<CoarseMatrix> A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
 | 
					 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    ///////////////////////
 | 
					    ///////////////////////
 | 
				
			||||||
    // Interface
 | 
					    // Interface
 | 
				
			||||||
@@ -217,7 +216,7 @@ namespace Grid {
 | 
				
			|||||||
      conformable(in._grid,out._grid);
 | 
					      conformable(in._grid,out._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      SimpleCompressor<siteVector> compressor;
 | 
					      SimpleCompressor<siteVector> compressor;
 | 
				
			||||||
      Stencil.HaloExchange(in,comm_buf,compressor);
 | 
					      Stencil.HaloExchange(in,compressor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int ss=0;ss<Grid()->oSites();ss++){
 | 
					      for(int ss=0;ss<Grid()->oSites();ss++){
 | 
				
			||||||
@@ -234,7 +233,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	  } else if(SE->_is_local) { 
 | 
						  } else if(SE->_is_local) { 
 | 
				
			||||||
	    nbr = in._odata[SE->_offset];
 | 
						    nbr = in._odata[SE->_offset];
 | 
				
			||||||
	  } else {
 | 
						  } else {
 | 
				
			||||||
	    nbr = comm_buf[SE->_offset];
 | 
						    nbr = Stencil.comm_buf[SE->_offset];
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	  res = res + A[point]._odata[ss]*nbr;
 | 
						  res = res + A[point]._odata[ss]*nbr;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
@@ -258,7 +257,6 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
 | 
					      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
 | 
				
			||||||
      A(geom.npoint,&CoarseGrid)
 | 
					      A(geom.npoint,&CoarseGrid)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      comm_buf.resize(Stencil._unified_buffer_size);
 | 
					 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 | 
					    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -44,7 +44,7 @@ public:
 | 
				
			|||||||
// Gather for when there is no need to SIMD split with compression
 | 
					// Gather for when there is no need to SIMD split with compression
 | 
				
			||||||
///////////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////////
 | 
				
			||||||
template<class vobj,class cobj,class compressor> void 
 | 
					template<class vobj,class cobj,class compressor> void 
 | 
				
			||||||
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
 | 
					Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
					  int rd = rhs._grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
 | 
				
			|||||||
      for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
	int o  = n*rhs._grid->_slice_stride[dimension];
 | 
						int o  = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
	int bo = n*rhs._grid->_slice_block[dimension];
 | 
						int bo = n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
@@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
 | 
				
			|||||||
	 int o  = n*rhs._grid->_slice_stride[dimension];
 | 
						 int o  = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
						 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
				
			||||||
	 if ( ocb &cbmask ) {
 | 
						 if ( ocb &cbmask ) {
 | 
				
			||||||
	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						   buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
	 }
 | 
						 }
 | 
				
			||||||
       }
 | 
					       }
 | 
				
			||||||
     }
 | 
					     }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -206,6 +206,9 @@ typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonP
 | 
				
			|||||||
typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 | 
					typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 | 
				
			||||||
typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 | 
					typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 | 
				
			||||||
typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 | 
					typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
					typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
					typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
					typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -58,7 +58,6 @@ namespace QCD {
 | 
				
			|||||||
	UmuOdd (&Hgrid) 
 | 
						UmuOdd (&Hgrid) 
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    // Allocate the required comms buffer
 | 
					    // Allocate the required comms buffer
 | 
				
			||||||
    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 | 
					 | 
				
			||||||
    ImportGauge(_Umu);
 | 
					    ImportGauge(_Umu);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -153,7 +152,7 @@ namespace QCD {
 | 
				
			|||||||
    FermionField Atilde(B._grid);
 | 
					    FermionField Atilde(B._grid);
 | 
				
			||||||
    Atilde = A;
 | 
					    Atilde = A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    st.HaloExchange(B,comm_buf,compressor);
 | 
					    st.HaloExchange(B,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    for(int mu=0;mu<Nd;mu++){
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -168,7 +167,7 @@ namespace QCD {
 | 
				
			|||||||
      ////////////////////////
 | 
					      ////////////////////////
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
	for(int sss=0;sss<B._grid->oSites();sss++){
 | 
						for(int sss=0;sss<B._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
 | 
						  Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
@@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    Stencil.HaloExchange(in,comm_buf,compressor);
 | 
					    Stencil.HaloExchange(in,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
 | 
						Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
@@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
    st.HaloExchange(in,comm_buf,compressor);
 | 
					    st.HaloExchange(in,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    if ( dag == DaggerYes ) {
 | 
					    if ( dag == DaggerYes ) {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
 | 
						  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
 | 
					    auto handle = st.HaloExchangeBegin(in,compressor);
 | 
				
			||||||
    comms_thread.join();
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bool local    = true;
 | 
					    bool local    = true;
 | 
				
			||||||
    bool nonlocal = false;
 | 
					    bool nonlocal = false;
 | 
				
			||||||
@@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    st.HaloExchangeComplete(handle);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    local    = false;
 | 
					    local    = false;
 | 
				
			||||||
    nonlocal = true;
 | 
					    nonlocal = true;
 | 
				
			||||||
@@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      } else { 
 | 
					      } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -153,9 +153,6 @@ namespace Grid {
 | 
				
			|||||||
      DoubledGaugeField UmuEven;
 | 
					      DoubledGaugeField UmuEven;
 | 
				
			||||||
      DoubledGaugeField UmuOdd;
 | 
					      DoubledGaugeField UmuOdd;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      // Comms buffer
 | 
					 | 
				
			||||||
      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
					    typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -98,12 +98,12 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Allocate the required comms buffer
 | 
					  // Allocate the required comms buffer
 | 
				
			||||||
  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  ImportGauge(_Umu);
 | 
					  ImportGauge(_Umu);
 | 
				
			||||||
 | 
					  alltime=0;
 | 
				
			||||||
  commtime=0;
 | 
					  commtime=0;
 | 
				
			||||||
  jointime=0;
 | 
					  jointime=0;
 | 
				
			||||||
  dslashtime=0;
 | 
					  dslashtime=0;
 | 
				
			||||||
 | 
					  dslash1time=0;
 | 
				
			||||||
}  
 | 
					}  
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
					void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
				
			||||||
@@ -121,7 +121,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
 | 
				
			|||||||
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 | 
					  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Compressor compressor(DaggerNo);
 | 
					  Compressor compressor(DaggerNo);
 | 
				
			||||||
  Stencil.HaloExchange(in,comm_buf,compressor);
 | 
					  Stencil.HaloExchange(in,compressor);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  int skip = (disp==1) ? 0 : 1;
 | 
					  int skip = (disp==1) ? 0 : 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -136,7 +136,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    for(int s=0;s<Ls;s++){
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
      int sU=ss;
 | 
					      int sU=ss;
 | 
				
			||||||
      int sF = s+Ls*sU; 
 | 
					      int sF = s+Ls*sU; 
 | 
				
			||||||
      Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
 | 
					      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@@ -159,7 +159,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 | 
				
			|||||||
  FermionField Btilde(B._grid);
 | 
					  FermionField Btilde(B._grid);
 | 
				
			||||||
  FermionField Atilde(B._grid);
 | 
					  FermionField Atilde(B._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  st.HaloExchange(B,comm_buf,compressor);
 | 
					  st.HaloExchange(B,compressor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Atilde=A;
 | 
					  Atilde=A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -184,7 +184,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	assert ( sF< B._grid->oSites());
 | 
						assert ( sF< B._grid->oSites());
 | 
				
			||||||
	assert ( sU< U._grid->oSites());
 | 
						assert ( sU< U._grid->oSites());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
 | 
						Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ////////////////////////////
 | 
					    ////////////////////////////
 | 
				
			||||||
    // spin trace outer product
 | 
					    // spin trace outer product
 | 
				
			||||||
@@ -235,22 +235,25 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 | 
				
			|||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::Report(void)
 | 
					void WilsonFermion5D<Impl>::Report(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Halo   time "<<commtime <<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Wilson5d      time "<<alltime <<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "HaloBegin     time "<<commtime <<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Dslash        time "<<dslashtime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Dslash        time "<<dslashtime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "join   time "<<jointime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Dslash1       time "<<dslash1time<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil All    time "<<Stencil.halotime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "HaloComplete  time "<<jointime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil all gather      time "<<Stencil.halogtime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil splice   gather time "<<Stencil.splicetime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil comm   time "<<Stencil.commstime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil merge  time "<<Stencil.mergetime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil buf    time "<<Stencil.buftime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
@@ -288,7 +291,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
				
			|||||||
					 const FermionField &in, FermionField &out,int dag)
 | 
										 const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
 | 
					  alltime-=usecond();
 | 
				
			||||||
  Compressor compressor(dag);
 | 
					  Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
					  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
				
			||||||
@@ -299,11 +302,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
				
			|||||||
  int nwork = U._grid->oSites();
 | 
					  int nwork = U._grid->oSites();
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  commtime -=usecond();
 | 
					  commtime -=usecond();
 | 
				
			||||||
  std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
 | 
					  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
				
			||||||
 | 
					  st.HaloExchangeComplete(handle);
 | 
				
			||||||
  commtime +=usecond();
 | 
					  commtime +=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  jointime -=usecond();
 | 
					  jointime -=usecond();
 | 
				
			||||||
  thr.join();
 | 
					 | 
				
			||||||
  jointime +=usecond();
 | 
					  jointime +=usecond();
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
					  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
				
			||||||
@@ -319,7 +322,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -330,7 +333,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	  for(sd=0;sd<Ls;sd++){
 | 
						  for(sd=0;sd<Ls;sd++){
 | 
				
			||||||
	    int sU=ss;
 | 
						    int sU=ss;
 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
						    int sF = sd+Ls*sU;
 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 | 
						    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -362,7 +365,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	      sU = lo.Reorder(sU);
 | 
						      sU = lo.Reorder(sU);
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	    sF = s+Ls*sU;
 | 
						    sF = s+Ls*sU;
 | 
				
			||||||
	    Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
						    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -387,7 +390,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	  sU=ss+ ssoff;
 | 
						  sU=ss+ ssoff;
 | 
				
			||||||
	  for(int s=soff;s<soff+swork;s++){
 | 
						  for(int s=soff;s<soff+swork;s++){
 | 
				
			||||||
	    sF = s+Ls*sU;
 | 
						    sF = s+Ls*sU;
 | 
				
			||||||
	    Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
						    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -398,7 +401,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -407,12 +410,13 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
						  int sF = s+Ls*sU; 
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  dslashtime +=usecond();
 | 
					  dslashtime +=usecond();
 | 
				
			||||||
 | 
					  alltime+=usecond();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
@@ -421,7 +425,10 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
 | 
				
			|||||||
						     const FermionField &in, FermionField &out,int dag)
 | 
											     const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
 | 
					  alltime-=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int calls;
 | 
				
			||||||
 | 
					  int updates;
 | 
				
			||||||
  Compressor compressor(dag);
 | 
					  Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
					  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
				
			||||||
@@ -432,7 +439,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
 | 
				
			|||||||
  int nwork = U._grid->oSites();
 | 
					  int nwork = U._grid->oSites();
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  commtime -=usecond();
 | 
					  commtime -=usecond();
 | 
				
			||||||
  std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
 | 
					  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
				
			||||||
  commtime +=usecond();
 | 
					  commtime +=usecond();
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
					  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
				
			||||||
@@ -450,7 +457,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -461,7 +468,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	  for(sd=0;sd<Ls;sd++){
 | 
						  for(sd=0;sd<Ls;sd++){
 | 
				
			||||||
	    int sU=ss;
 | 
						    int sU=ss;
 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
						    int sF = sd+Ls*sU;
 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -473,7 +480,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -482,7 +489,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
						  int sF = s+Ls*sU; 
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -490,12 +497,12 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
  dslashtime +=usecond();
 | 
					  dslashtime +=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  jointime -=usecond();
 | 
					  jointime -=usecond();
 | 
				
			||||||
  thr.join();
 | 
					  st.HaloExchangeComplete(handle);
 | 
				
			||||||
  jointime +=usecond();
 | 
					  jointime +=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  local    = false;
 | 
					  local    = false;
 | 
				
			||||||
  nonlocal = true;
 | 
					  nonlocal = true;
 | 
				
			||||||
  dslashtime -=usecond();
 | 
					  dslash1time -=usecond();
 | 
				
			||||||
  if ( dag == DaggerYes ) {
 | 
					  if ( dag == DaggerYes ) {
 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					    if( this->HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
@@ -503,7 +510,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -514,7 +521,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	  for(sd=0;sd<Ls;sd++){
 | 
						  for(sd=0;sd<Ls;sd++){
 | 
				
			||||||
	    int sU=ss;
 | 
						    int sU=ss;
 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
						    int sF = sd+Ls*sU;
 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -526,7 +533,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
@@ -535,13 +542,13 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
						  int sF = s+Ls*sU; 
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
 | 
						  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  dslashtime +=usecond();
 | 
					  dslash1time +=usecond();
 | 
				
			||||||
 | 
					  alltime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -61,9 +61,11 @@ namespace Grid {
 | 
				
			|||||||
    public:
 | 
					    public:
 | 
				
			||||||
     INHERIT_IMPL_TYPES(Impl);
 | 
					     INHERIT_IMPL_TYPES(Impl);
 | 
				
			||||||
     typedef WilsonKernels<Impl> Kernels;
 | 
					     typedef WilsonKernels<Impl> Kernels;
 | 
				
			||||||
 | 
					     double alltime;
 | 
				
			||||||
     double jointime;
 | 
					     double jointime;
 | 
				
			||||||
     double commtime;
 | 
					     double commtime;
 | 
				
			||||||
     double dslashtime;
 | 
					     double dslashtime;
 | 
				
			||||||
 | 
					     double dslash1time;
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Implement the abstract base
 | 
					      // Implement the abstract base
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -60,15 +60,10 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
 | 
					<<<<<<< HEAD
 | 
				
			||||||
				int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
									int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
//			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 | 
					//			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 | 
				
			||||||
#if 0
 | 
					 | 
				
			||||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
					 | 
				
			||||||
			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
 | 
					 | 
				
			||||||
       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					 | 
				
			||||||
     }
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
// doesn't seem to work with Gparity at the moment
 | 
					// doesn't seem to work with Gparity at the moment
 | 
				
			||||||
#undef HANDOPT
 | 
					#undef HANDOPT
 | 
				
			||||||
#if 1
 | 
					#if 1
 | 
				
			||||||
@@ -79,7 +74,18 @@ namespace Grid {
 | 
				
			|||||||
     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				  int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
									  int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
//				  int sF,int sU,const FermionField &in, FermionField &out);
 | 
					#endif
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					     
 | 
				
			||||||
 | 
					     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
									 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					//>>>>>>> fc6ad657514c7966291c19f22af89de5d5a96f93
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     WilsonKernels(const ImplParams &p= ImplParams());
 | 
					     WilsonKernels(const ImplParams &p= ImplParams());
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -310,7 +310,7 @@ namespace QCD {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
											   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -318,21 +318,21 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  REGISTER Simd result_00; // 12 regs on knc
 | 
					  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_01;
 | 
					  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_02;
 | 
					  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  REGISTER Simd result_10;
 | 
					  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_11;
 | 
					  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_12;
 | 
					  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  REGISTER Simd result_20;
 | 
					  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_21;
 | 
					  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_22;
 | 
					  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  REGISTER Simd result_30;
 | 
					  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_31;
 | 
					  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
				
			||||||
  REGISTER Simd result_32; // 20 left
 | 
					  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
					  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
				
			||||||
  REGISTER Simd Chi_01;
 | 
					  REGISTER Simd Chi_01;
 | 
				
			||||||
@@ -372,172 +372,178 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  int offset,local,perm, ptype;
 | 
					  int offset, ptype;
 | 
				
			||||||
 | 
					  int num = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Xp
 | 
					  // Xp
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
					  SE=st.GetEntry(ptype,Xp,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    XP_PROJ;
 | 
					    XP_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Xp);
 | 
					    MULT_2SPIN(Xp);
 | 
				
			||||||
 | 
					    XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  XP_RECON;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Yp
 | 
					  // Yp
 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
					  SE=st.GetEntry(ptype,Yp,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    YP_PROJ;
 | 
					    YP_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Yp);
 | 
					    MULT_2SPIN(Yp);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    YP_RECON_ACCUM;
 | 
					    YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Zp
 | 
					  // Zp
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
					  SE=st.GetEntry(ptype,Zp,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    ZP_PROJ;
 | 
					    ZP_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Zp);
 | 
					    MULT_2SPIN(Zp);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    ZP_RECON_ACCUM;
 | 
					    ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Tp
 | 
					  // Tp
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
					  SE=st.GetEntry(ptype,Tp,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    TP_PROJ;
 | 
					    TP_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Tp);
 | 
					    MULT_2SPIN(Tp);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    TP_RECON_ACCUM;
 | 
					    TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Xm
 | 
					  // Xm
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
					  SE=st.GetEntry(ptype,Xm,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    XM_PROJ;
 | 
					    XM_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Xm);
 | 
					    MULT_2SPIN(Xm);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    XM_RECON_ACCUM;
 | 
					    XM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Ym
 | 
					  // Ym
 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
					  SE=st.GetEntry(ptype,Ym,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    YM_PROJ;
 | 
					    YM_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Ym);
 | 
					    MULT_2SPIN(Ym);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    YM_RECON_ACCUM;
 | 
					    YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Zm
 | 
					  // Zm
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
					  SE=st.GetEntry(ptype,Zm,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    ZM_PROJ;
 | 
					    ZM_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Zm);
 | 
					    MULT_2SPIN(Zm);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    ZM_RECON_ACCUM;
 | 
					    ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Tm
 | 
					  // Tm
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
					  SE=st.GetEntry(ptype,Tm,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( local ) {
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
    LOAD_CHIMU;
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
    TM_PROJ;
 | 
					    TM_PROJ;
 | 
				
			||||||
    if ( perm) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
    LOAD_CHI;
 | 
					    LOAD_CHI;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  {
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
    MULT_2SPIN(Tm);
 | 
					    MULT_2SPIN(Tm);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    TM_RECON_ACCUM;
 | 
					    TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
  SiteSpinor & ref (out._odata[ss]);
 | 
					  SiteSpinor & ref (out._odata[ss]);
 | 
				
			||||||
 | 
					  if ( Local ) {
 | 
				
			||||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
					    vstream(ref()(0)(0),result_00*(-0.5));
 | 
				
			||||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
					    vstream(ref()(0)(1),result_01*(-0.5));
 | 
				
			||||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
					    vstream(ref()(0)(2),result_02*(-0.5));
 | 
				
			||||||
@@ -550,9 +556,295 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),result_30*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),result_31*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
 | 
					  } else if ( num ) { 
 | 
				
			||||||
 | 
					    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Impl>
 | 
				
			||||||
 | 
					int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
											std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
											int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
				
			||||||
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_01;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_02;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_10;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_11;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_12;   // 14 left
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_01;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_02;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_10;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_11;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_12;  // 8 left
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd U_00;  // two rows of U matrix
 | 
				
			||||||
 | 
					  REGISTER Simd U_10;
 | 
				
			||||||
 | 
					  REGISTER Simd U_20;  
 | 
				
			||||||
 | 
					  REGISTER Simd U_01;
 | 
				
			||||||
 | 
					  REGISTER Simd U_11;
 | 
				
			||||||
 | 
					  REGISTER Simd U_21;  // 2 reg left.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Chimu_00 Chi_00
 | 
				
			||||||
 | 
					#define Chimu_01 Chi_01
 | 
				
			||||||
 | 
					#define Chimu_02 Chi_02
 | 
				
			||||||
 | 
					#define Chimu_10 Chi_10
 | 
				
			||||||
 | 
					#define Chimu_11 Chi_11
 | 
				
			||||||
 | 
					#define Chimu_12 Chi_12
 | 
				
			||||||
 | 
					#define Chimu_20 UChi_00
 | 
				
			||||||
 | 
					#define Chimu_21 UChi_01
 | 
				
			||||||
 | 
					#define Chimu_22 UChi_02
 | 
				
			||||||
 | 
					#define Chimu_30 UChi_10
 | 
				
			||||||
 | 
					#define Chimu_31 UChi_11
 | 
				
			||||||
 | 
					#define Chimu_32 UChi_12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					  int offset, ptype;
 | 
				
			||||||
 | 
					  int num = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    XM_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Xp);
 | 
				
			||||||
 | 
					    XM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Yp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Yp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    YM_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Yp);
 | 
				
			||||||
 | 
					    YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    ZM_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Zp);
 | 
				
			||||||
 | 
					    ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    TM_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Tp);
 | 
				
			||||||
 | 
					    TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    XP_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Xm);
 | 
				
			||||||
 | 
					    XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Ym,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    YP_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Ym);
 | 
				
			||||||
 | 
					    YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    ZP_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Zm);
 | 
				
			||||||
 | 
					    ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (Local && SE->_is_local ) { 
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    TP_PROJ;
 | 
				
			||||||
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Tm);
 | 
				
			||||||
 | 
					    TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					    num++;  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SiteSpinor & ref (out._odata[ss]);
 | 
				
			||||||
 | 
					  if ( Local ) {
 | 
				
			||||||
 | 
					    vstream(ref()(0)(0),result_00*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(1),result_01*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(2),result_02*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(0),result_10*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(1),result_11*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(2),result_12*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(0),result_20*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(1),result_21*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(2),result_22*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(0),result_30*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(1),result_31*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(2),result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
 | 
					  } else if ( num ) { 
 | 
				
			||||||
 | 
					    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
				
			||||||
 | 
					    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /*
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
@@ -795,7 +1087,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
				
			|||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),result_32*(-0.5));
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
  // Specialise Gparity to simple implementation
 | 
					  // Specialise Gparity to simple implementation
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 }
 | 
					 }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj> inline 
 | 
				
			||||||
 | 
					void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  scalar_type *pointer;
 | 
				
			||||||
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
 | 
					    pointer=(scalar_type *)&extracted[i][offset];
 | 
				
			||||||
 | 
					    for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					      vp[w*Nsimd+i] = pointer[w];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj> inline 
 | 
				
			||||||
 | 
					void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  scalar_type *pointer;
 | 
				
			||||||
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					    for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
 | 
					      pointer=(scalar_type *)&extracted[i][offset];
 | 
				
			||||||
 | 
					      vp[w*Nsimd+i] =pointer[w];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -99,9 +99,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
						  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	std::vector<vobj,alignedAllocator<vobj> >  comm_buf(myStencil._unified_buffer_size);
 | 
					 | 
				
			||||||
	SimpleCompressor<vobj> compress;
 | 
						SimpleCompressor<vobj> compress;
 | 
				
			||||||
	myStencil.HaloExchange(Foo,comm_buf,compress);
 | 
						myStencil.HaloExchange(Foo,compress);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Bar = Cshift(Foo,dir,disp);
 | 
						Bar = Cshift(Foo,dir,disp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -117,7 +116,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  else if (SE->_is_local)
 | 
						  else if (SE->_is_local)
 | 
				
			||||||
	    Check._odata[i] = Foo._odata[SE->_offset];
 | 
						    Check._odata[i] = Foo._odata[SE->_offset];
 | 
				
			||||||
	  else 
 | 
						  else 
 | 
				
			||||||
	    Check._odata[i] = comm_buf[SE->_offset];
 | 
						    Check._odata[i] = myStencil.comm_buf[SE->_offset];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Real nrmC = norm2(Check);
 | 
						Real nrmC = norm2(Check);
 | 
				
			||||||
@@ -181,13 +180,10 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
						  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	std::vector<vobj,alignedAllocator<vobj> >  Ecomm_buf(EStencil._unified_buffer_size);
 | 
					 | 
				
			||||||
	std::vector<vobj,alignedAllocator<vobj> >  Ocomm_buf(OStencil._unified_buffer_size);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	SimpleCompressor<vobj> compress;
 | 
						SimpleCompressor<vobj> compress;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
 | 
						EStencil.HaloExchange(EFoo,compress);
 | 
				
			||||||
	OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
 | 
						OStencil.HaloExchange(OFoo,compress);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	Bar = Cshift(Foo,dir,disp);
 | 
						Bar = Cshift(Foo,dir,disp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -211,7 +207,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  else if (SE->_is_local)
 | 
						  else if (SE->_is_local)
 | 
				
			||||||
	    OCheck._odata[i] = EFoo._odata[SE->_offset];
 | 
						    OCheck._odata[i] = EFoo._odata[SE->_offset];
 | 
				
			||||||
	  else 
 | 
						  else 
 | 
				
			||||||
	    OCheck._odata[i] = Ecomm_buf[SE->_offset];
 | 
						    OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	for(int i=0;i<ECheck._grid->oSites();i++){
 | 
						for(int i=0;i<ECheck._grid->oSites();i++){
 | 
				
			||||||
	  int permute_type;
 | 
						  int permute_type;
 | 
				
			||||||
@@ -224,7 +220,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  else if (SE->_is_local)
 | 
						  else if (SE->_is_local)
 | 
				
			||||||
	    ECheck._odata[i] = OFoo._odata[SE->_offset];
 | 
						    ECheck._odata[i] = OFoo._odata[SE->_offset];
 | 
				
			||||||
	  else 
 | 
						  else 
 | 
				
			||||||
	    ECheck._odata[i] = Ocomm_buf[SE->_offset];
 | 
						    ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	setCheckerboard(Check,ECheck);
 | 
						setCheckerboard(Check,ECheck);
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user