mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/hadrons
This commit is contained in:
		
							
								
								
									
										276
									
								
								lib/FFT.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										276
									
								
								lib/FFT.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,276 @@
 | 
			
		||||
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/Cshift.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef _GRID_FFT_H_
 | 
			
		||||
#define _GRID_FFT_H_
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_FFTW	
 | 
			
		||||
#include <fftw3.h>
 | 
			
		||||
#endif
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  template<class scalar> struct FFTW { };
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_FFTW	
 | 
			
		||||
  template<> struct FFTW<ComplexD> {
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    typedef fftw_complex FFTW_scalar;
 | 
			
		||||
    typedef fftw_plan    FFTW_plan;
 | 
			
		||||
 | 
			
		||||
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 | 
			
		||||
					FFTW_scalar *in, const int *inembed,		
 | 
			
		||||
					int istride, int idist,		
 | 
			
		||||
					FFTW_scalar *out, const int *onembed,		
 | 
			
		||||
					int ostride, int odist,		
 | 
			
		||||
					int sign, unsigned flags) {
 | 
			
		||||
      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
 | 
			
		||||
    }	  
 | 
			
		||||
    
 | 
			
		||||
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
 | 
			
		||||
      ::fftw_flops(p,add,mul,fmas);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
 | 
			
		||||
      ::fftw_execute_dft(p,in,out);
 | 
			
		||||
    }
 | 
			
		||||
    inline static void fftw_destroy_plan(const FFTW_plan p) {
 | 
			
		||||
      ::fftw_destroy_plan(p);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template<> struct FFTW<ComplexF> {
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    typedef fftwf_complex FFTW_scalar;
 | 
			
		||||
    typedef fftwf_plan    FFTW_plan;
 | 
			
		||||
 | 
			
		||||
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 | 
			
		||||
					FFTW_scalar *in, const int *inembed,		
 | 
			
		||||
					int istride, int idist,		
 | 
			
		||||
					FFTW_scalar *out, const int *onembed,		
 | 
			
		||||
					int ostride, int odist,		
 | 
			
		||||
					int sign, unsigned flags) {
 | 
			
		||||
      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
 | 
			
		||||
    }	  
 | 
			
		||||
    
 | 
			
		||||
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
 | 
			
		||||
      ::fftwf_flops(p,add,mul,fmas);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
 | 
			
		||||
      ::fftwf_execute_dft(p,in,out);
 | 
			
		||||
    }
 | 
			
		||||
    inline static void fftw_destroy_plan(const FFTW_plan p) {
 | 
			
		||||
      ::fftwf_destroy_plan(p);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifndef FFTW_FORWARD
 | 
			
		||||
#define FFTW_FORWARD (-1)
 | 
			
		||||
#define FFTW_BACKWARD (+1)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  class FFT { 
 | 
			
		||||
  private:
 | 
			
		||||
 | 
			
		||||
    GridCartesian *vgrid;
 | 
			
		||||
    GridCartesian *sgrid;
 | 
			
		||||
 | 
			
		||||
    int Nd;
 | 
			
		||||
    double flops;
 | 
			
		||||
    double flops_call;
 | 
			
		||||
    uint64_t usec;
 | 
			
		||||
 | 
			
		||||
    std::vector<int> dimensions;
 | 
			
		||||
    std::vector<int> processors;
 | 
			
		||||
    std::vector<int> processor_coor;
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    static const int forward=FFTW_FORWARD;
 | 
			
		||||
    static const int backward=FFTW_BACKWARD;
 | 
			
		||||
 | 
			
		||||
    double Flops(void) {return flops;}
 | 
			
		||||
    double MFlops(void) {return flops/usec;}
 | 
			
		||||
 | 
			
		||||
    FFT ( GridCartesian * grid ) : 
 | 
			
		||||
      vgrid(grid),
 | 
			
		||||
      Nd(grid->_ndimension),
 | 
			
		||||
      dimensions(grid->_fdimensions),
 | 
			
		||||
      processors(grid->_processors),
 | 
			
		||||
      processor_coor(grid->_processor_coor)
 | 
			
		||||
    {
 | 
			
		||||
      flops=0;
 | 
			
		||||
      usec =0;
 | 
			
		||||
      std::vector<int> layout(Nd,1);
 | 
			
		||||
      sgrid = new GridCartesian(dimensions,layout,processors);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    ~FFT ( void)  { 
 | 
			
		||||
      delete sgrid; 
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    template<class vobj>
 | 
			
		||||
    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
 | 
			
		||||
 | 
			
		||||
      conformable(result._grid,vgrid);
 | 
			
		||||
      conformable(source._grid,vgrid);
 | 
			
		||||
 | 
			
		||||
      int L = vgrid->_ldimensions[dim];
 | 
			
		||||
      int G = vgrid->_fdimensions[dim];
 | 
			
		||||
 | 
			
		||||
      std::vector<int> layout(Nd,1);
 | 
			
		||||
      std::vector<int> pencil_gd(vgrid->_fdimensions);
 | 
			
		||||
 | 
			
		||||
      pencil_gd[dim] = G*processors[dim];    
 | 
			
		||||
 | 
			
		||||
      // Pencil global vol LxLxGxLxL per node
 | 
			
		||||
      GridCartesian pencil_g(pencil_gd,layout,processors);
 | 
			
		||||
 | 
			
		||||
      // Construct pencils
 | 
			
		||||
      typedef typename vobj::scalar_object sobj;
 | 
			
		||||
      typedef typename sobj::scalar_type   scalar;
 | 
			
		||||
 | 
			
		||||
      Lattice<vobj> ssource(vgrid); ssource =source;
 | 
			
		||||
      Lattice<sobj> pgsource(&pencil_g);
 | 
			
		||||
      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_FFTW	
 | 
			
		||||
      assert(0);
 | 
			
		||||
#else 
 | 
			
		||||
      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
 | 
			
		||||
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	int Ncomp = sizeof(sobj)/sizeof(scalar);
 | 
			
		||||
	int Nlow  = 1;
 | 
			
		||||
	for(int d=0;d<dim;d++){
 | 
			
		||||
	  Nlow*=vgrid->_ldimensions[d];
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	int rank = 1;  /* 1d transforms */
 | 
			
		||||
	int n[] = {G}; /* 1d transforms of length G */
 | 
			
		||||
	int howmany = Ncomp;
 | 
			
		||||
	int odist,idist,istride,ostride;
 | 
			
		||||
	idist   = odist   = 1;          /* Distance between consecutive FT's */
 | 
			
		||||
	istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
 | 
			
		||||
	int *inembed = n, *onembed = n;
 | 
			
		||||
 | 
			
		||||
	
 | 
			
		||||
	int sign = FFTW_FORWARD;
 | 
			
		||||
	if (inverse) sign = FFTW_BACKWARD;
 | 
			
		||||
 | 
			
		||||
	FFTW_plan p;
 | 
			
		||||
	{
 | 
			
		||||
	  FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
 | 
			
		||||
	  FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
 | 
			
		||||
	  p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
 | 
			
		||||
					       in,inembed,
 | 
			
		||||
					       istride,idist,
 | 
			
		||||
					       out,onembed,
 | 
			
		||||
					       ostride, odist,
 | 
			
		||||
					       sign,FFTW_ESTIMATE);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	double add,mul,fma;
 | 
			
		||||
	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
 | 
			
		||||
	flops_call = add+mul+2.0*fma;
 | 
			
		||||
 | 
			
		||||
	GridStopWatch timer;
 | 
			
		||||
 | 
			
		||||
	// Barrel shift and collect global pencil
 | 
			
		||||
	for(int p=0;p<processors[dim];p++) { 
 | 
			
		||||
 | 
			
		||||
	  for(int idx=0;idx<sgrid->lSites();idx++) { 
 | 
			
		||||
 | 
			
		||||
	    std::vector<int> lcoor(Nd);
 | 
			
		||||
    	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
 | 
			
		||||
	    sobj s;
 | 
			
		||||
 | 
			
		||||
	    peekLocalSite(s,ssource,lcoor);
 | 
			
		||||
 | 
			
		||||
	    lcoor[dim]+=p*L;
 | 
			
		||||
	   
 | 
			
		||||
	    pokeLocalSite(s,pgsource,lcoor);
 | 
			
		||||
	  }
 | 
			
		||||
 | 
			
		||||
	  ssource = Cshift(ssource,dim,L);
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	// Loop over orthog coords
 | 
			
		||||
	int NN=pencil_g.lSites();
 | 
			
		||||
 | 
			
		||||
	GridStopWatch Timer;
 | 
			
		||||
	Timer.Start();
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
	for(int idx=0;idx<NN;idx++) { 
 | 
			
		||||
 | 
			
		||||
	  std::vector<int> lcoor(Nd);
 | 
			
		||||
	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
 | 
			
		||||
	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
 | 
			
		||||
	    FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
 | 
			
		||||
	    FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
 | 
			
		||||
	    FFTW<scalar>::fftw_execute_dft(p,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
        Timer.Stop();
 | 
			
		||||
	usec += Timer.useconds();
 | 
			
		||||
	flops+= flops_call*NN;
 | 
			
		||||
 | 
			
		||||
        int pc = processor_coor[dim];
 | 
			
		||||
        for(int idx=0;idx<sgrid->lSites();idx++) { 
 | 
			
		||||
	  std::vector<int> lcoor(Nd);
 | 
			
		||||
	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
	  std::vector<int> gcoor = lcoor;
 | 
			
		||||
	  // extract the result
 | 
			
		||||
	  sobj s;
 | 
			
		||||
	  gcoor[dim] = lcoor[dim]+L*pc;
 | 
			
		||||
	  peekLocalSite(s,pgresult,gcoor);
 | 
			
		||||
	  pokeLocalSite(s,result,lcoor);
 | 
			
		||||
	}
 | 
			
		||||
      	  
 | 
			
		||||
	FFTW<scalar>::fftw_destroy_plan(p);
 | 
			
		||||
      }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/Simd.h>
 | 
			
		||||
#include <Grid/Threads.h>
 | 
			
		||||
#include <Grid/Lexicographic.h>
 | 
			
		||||
#include <Grid/Init.h>
 | 
			
		||||
#include <Grid/Communicator.h> 
 | 
			
		||||
#include <Grid/Cartesian.h>    
 | 
			
		||||
#include <Grid/Tensors.h>      
 | 
			
		||||
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/parallelIO/BinaryIO.h>
 | 
			
		||||
#include <Grid/qcd/QCD.h>
 | 
			
		||||
#include <Grid/parallelIO/NerscIO.h>
 | 
			
		||||
#include <Grid/Init.h>
 | 
			
		||||
 | 
			
		||||
#include <Grid/FFT.h>
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/hmc/NerscCheckpointer.h>
 | 
			
		||||
#include <Grid/qcd/hmc/HmcRunner.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
 | 
			
		||||
    assert(ompthreads.size()==1);
 | 
			
		||||
    GridThread::SetThreads(ompthreads[0]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
 | 
			
		||||
    std::vector<int> cores(0);
 | 
			
		||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
 | 
			
		||||
@@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    GridLogConfigure(logstreams);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
 | 
			
		||||
    Grid_debug_handler_init();
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -17,8 +17,8 @@ endif
 | 
			
		||||
include Make.inc
 | 
			
		||||
include Eigen.inc
 | 
			
		||||
 | 
			
		||||
lib_LTLIBRARIES = libGrid.la
 | 
			
		||||
lib_LIBRARIES = libGrid.a
 | 
			
		||||
 | 
			
		||||
libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
 | 
			
		||||
libGrid_ladir                  = $(pkgincludedir)
 | 
			
		||||
libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
 | 
			
		||||
libGrid_adir                   = $(pkgincludedir)
 | 
			
		||||
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
 | 
			
		||||
 
 | 
			
		||||
@@ -265,7 +265,7 @@
 | 
			
		||||
	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
 | 
			
		||||
       }
 | 
			
		||||
       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 | 
			
		||||
	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 | 
			
		||||
	 //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 | 
			
		||||
	 local = _entries[ent]._is_local;
 | 
			
		||||
	 perm  = _entries[ent]._permute;
 | 
			
		||||
	 if (perm)  ptype = _permute_type[point]; 
 | 
			
		||||
 
 | 
			
		||||
@@ -127,21 +127,12 @@ class CartesianCommunicator {
 | 
			
		||||
			int recv_from_rank,
 | 
			
		||||
			int bytes);
 | 
			
		||||
 | 
			
		||||
    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 | 
			
		||||
			    void *xmit,
 | 
			
		||||
			    int xmit_to_rank,
 | 
			
		||||
			    void *recv,
 | 
			
		||||
			    int recv_from_rank,
 | 
			
		||||
			    int bytes);
 | 
			
		||||
 | 
			
		||||
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
			 void *xmit,
 | 
			
		||||
			 int xmit_to_rank,
 | 
			
		||||
			 void *recv,
 | 
			
		||||
			 int recv_from_rank,
 | 
			
		||||
			 int bytes);
 | 
			
		||||
 | 
			
		||||
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
 | 
			
		||||
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 | 
			
		||||
					       void *xmit,
 | 
			
		||||
					       int dest,
 | 
			
		||||
					       void *recv,
 | 
			
		||||
					       int from,
 | 
			
		||||
					       int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Request xrq;
 | 
			
		||||
  MPI_Request rrq;
 | 
			
		||||
  int rank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
 | 
			
		||||
  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  list.push_back(xrq);
 | 
			
		||||
  list.push_back(rrq);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Startall(list.size(),&list[0]);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
@@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> reqs(0);
 | 
			
		||||
  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromBegin(reqs);
 | 
			
		||||
  for(int i=0;i<reqs.size();i++){
 | 
			
		||||
    list.push_back(reqs[i]);
 | 
			
		||||
  }
 | 
			
		||||
  MPI_Request xrq;
 | 
			
		||||
  MPI_Request rrq;
 | 
			
		||||
  int rank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
 | 
			
		||||
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
 | 
			
		||||
  
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
 | 
			
		||||
  list.push_back(xrq);
 | 
			
		||||
  list.push_back(rrq);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
@@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
						void *recv,
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
 
 | 
			
		||||
@@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  assert(0); //unimplemented
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
@@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 | 
			
		||||
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
 | 
			
		||||
  shmem_putmem(recv,xmit,bytes,dest);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
						void *recv,
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  assert(0); // Unimplemented
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  //  shmem_quiet();      // I'm done
 | 
			
		||||
 
 | 
			
		||||
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
			
		||||
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<ig->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(ni);
 | 
			
		||||
    ig->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
  int nl = lg->_ndimension;
 | 
			
		||||
  int nh = hg->_ndimension;
 | 
			
		||||
 | 
			
		||||
  assert(nl == nh);
 | 
			
		||||
  assert(orthog<nh);
 | 
			
		||||
  assert(orthog>=0);
 | 
			
		||||
 | 
			
		||||
  for(int d=0;d<nh;d++){
 | 
			
		||||
    assert(lg->_processors[d]  == hg->_processors[d]);
 | 
			
		||||
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    if( lcoor[orthog] == slice_lo ) { 
 | 
			
		||||
      hcoor=lcoor;
 | 
			
		||||
      hcoor[orthog] = slice_hi;
 | 
			
		||||
      peekLocalSite(s,lowDim,lcoor);
 | 
			
		||||
      pokeLocalSite(s,higherDim,hcoor);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
  int nl = lg->_ndimension;
 | 
			
		||||
  int nh = hg->_ndimension;
 | 
			
		||||
 | 
			
		||||
  assert(nl == nh);
 | 
			
		||||
  assert(orthog<nh);
 | 
			
		||||
  assert(orthog>=0);
 | 
			
		||||
 | 
			
		||||
  for(int d=0;d<nh;d++){
 | 
			
		||||
    assert(lg->_processors[d]  == hg->_processors[d]);
 | 
			
		||||
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    if( lcoor[orthog] == slice_lo ) { 
 | 
			
		||||
      hcoor=lcoor;
 | 
			
		||||
      hcoor[orthog] = slice_hi;
 | 
			
		||||
      peekLocalSite(s,higherDim,hcoor);
 | 
			
		||||
      pokeLocalSite(s,lowDim,lcoor);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
@@ -194,22 +194,22 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      std::vector<int> site({x,y,z,t});
 | 
			
		||||
 | 
			
		||||
      if ( grid->IsBoss() ) {
 | 
			
		||||
	fin.read((char *)&file_object,sizeof(file_object));
 | 
			
		||||
	bytes += sizeof(file_object);
 | 
			
		||||
	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
      if (grid->IsBoss()) {
 | 
			
		||||
        fin.read((char *)&file_object, sizeof(file_object));
 | 
			
		||||
        bytes += sizeof(file_object);
 | 
			
		||||
        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
 | 
			
		||||
        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
 | 
			
		||||
        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
 | 
			
		||||
        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
 | 
			
		||||
 | 
			
		||||
	munge(file_object,munged,csum);
 | 
			
		||||
        munge(file_object, munged, csum);
 | 
			
		||||
      }
 | 
			
		||||
      // The boss who read the file has their value poked
 | 
			
		||||
      pokeSite(munged,Umu,site);
 | 
			
		||||
    }}}}
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
@@ -254,20 +254,20 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      if ( grid->IsBoss() ) {
 | 
			
		||||
	
 | 
			
		||||
	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
  
 | 
			
		||||
  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
 | 
			
		||||
	// NB could gather an xstrip as an optimisation.
 | 
			
		||||
	fout.write((char *)&file_object,sizeof(file_object));
 | 
			
		||||
	bytes+=sizeof(file_object);
 | 
			
		||||
  // NB could gather an xstrip as an optimisation.
 | 
			
		||||
  fout.write((char *)&file_object,sizeof(file_object));
 | 
			
		||||
  bytes+=sizeof(file_object);
 | 
			
		||||
      }
 | 
			
		||||
    }}}}
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
@@ -305,15 +305,15 @@ class BinaryIO {
 | 
			
		||||
      int l_idx=parallel.generator_idx(o_idx,i_idx);
 | 
			
		||||
 | 
			
		||||
      if( rank == grid->ThisRank() ){
 | 
			
		||||
	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
 | 
			
		||||
	parallel.GetState(saved,l_idx);
 | 
			
		||||
  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
 | 
			
		||||
  parallel.GetState(saved,l_idx);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      grid->Broadcast(rank,(void *)&saved[0],bytes);
 | 
			
		||||
 | 
			
		||||
      if ( grid->IsBoss() ) {
 | 
			
		||||
	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
	fout.write((char *)&saved[0],bytes);
 | 
			
		||||
  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
  fout.write((char *)&saved[0],bytes);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -355,14 +355,14 @@ class BinaryIO {
 | 
			
		||||
      int l_idx=parallel.generator_idx(o_idx,i_idx);
 | 
			
		||||
 | 
			
		||||
      if ( grid->IsBoss() ) {
 | 
			
		||||
	fin.read((char *)&saved[0],bytes);
 | 
			
		||||
	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
  fin.read((char *)&saved[0],bytes);
 | 
			
		||||
  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      grid->Broadcast(0,(void *)&saved[0],bytes);
 | 
			
		||||
 | 
			
		||||
      if( rank == grid->ThisRank() ){
 | 
			
		||||
	parallel.SetState(saved,l_idx);
 | 
			
		||||
  parallel.SetState(saved,l_idx);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -415,15 +415,15 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      if ( d == 0 ) parallel[d] = 0;
 | 
			
		||||
      if (parallel[d]) {
 | 
			
		||||
	range[d] = grid->_ldimensions[d];
 | 
			
		||||
	start[d] = grid->_processor_coor[d]*range[d];
 | 
			
		||||
	ioproc[d]= grid->_processor_coor[d];
 | 
			
		||||
  range[d] = grid->_ldimensions[d];
 | 
			
		||||
  start[d] = grid->_processor_coor[d]*range[d];
 | 
			
		||||
  ioproc[d]= grid->_processor_coor[d];
 | 
			
		||||
      } else {
 | 
			
		||||
	range[d] = grid->_gdimensions[d];
 | 
			
		||||
	start[d] = 0;
 | 
			
		||||
	ioproc[d]= 0;
 | 
			
		||||
  range[d] = grid->_gdimensions[d];
 | 
			
		||||
  start[d] = 0;
 | 
			
		||||
  ioproc[d]= 0;
 | 
			
		||||
 | 
			
		||||
	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
 | 
			
		||||
  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
 | 
			
		||||
      }
 | 
			
		||||
      slice_vol = slice_vol * range[d];
 | 
			
		||||
    }
 | 
			
		||||
@@ -434,9 +434,9 @@ class BinaryIO {
 | 
			
		||||
      std::cout<< std::dec ;
 | 
			
		||||
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
 | 
			
		||||
      for(int d=0;d<grid->_ndimension;d++){
 | 
			
		||||
	std::cout<< range[d];
 | 
			
		||||
	if( d< grid->_ndimension-1 ) 
 | 
			
		||||
	  std::cout<< " x ";
 | 
			
		||||
  std::cout<< range[d];
 | 
			
		||||
  if( d< grid->_ndimension-1 ) 
 | 
			
		||||
    std::cout<< " x ";
 | 
			
		||||
      }
 | 
			
		||||
      std::cout << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
@@ -463,7 +463,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      // need to implement these loops in Nd independent way with a lexico conversion
 | 
			
		||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
			
		||||
	
 | 
			
		||||
  
 | 
			
		||||
      std::vector<int> tsite(nd); // temporary mixed up site
 | 
			
		||||
      std::vector<int> gsite(nd);
 | 
			
		||||
      std::vector<int> lsite(nd);
 | 
			
		||||
@@ -472,8 +472,8 @@ class BinaryIO {
 | 
			
		||||
      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
	gsite[d] = tsite[d]+start[d];               // global site
 | 
			
		||||
  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
  gsite[d] = tsite[d]+start[d];               // global site
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      /////////////////////////
 | 
			
		||||
@@ -487,29 +487,29 @@ class BinaryIO {
 | 
			
		||||
      // iorank reads from the seek
 | 
			
		||||
      ////////////////////////////////
 | 
			
		||||
      if (myrank == iorank) {
 | 
			
		||||
	
 | 
			
		||||
	fin.seekg(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
	fin.read((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	bytes+=sizeof(fileObj);
 | 
			
		||||
	
 | 
			
		||||
	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	
 | 
			
		||||
	munge(fileObj,siteObj,csum);
 | 
			
		||||
  
 | 
			
		||||
  fin.seekg(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
  fin.read((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  bytes+=sizeof(fileObj);
 | 
			
		||||
  
 | 
			
		||||
  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  
 | 
			
		||||
  munge(fileObj,siteObj,csum);
 | 
			
		||||
 | 
			
		||||
      }	
 | 
			
		||||
      } 
 | 
			
		||||
 | 
			
		||||
      // Possibly do transport through pt2pt 
 | 
			
		||||
      if ( rank != iorank ) { 
 | 
			
		||||
	if ( (myrank == rank) || (myrank==iorank) ) {
 | 
			
		||||
	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 | 
			
		||||
	}
 | 
			
		||||
  if ( (myrank == rank) || (myrank==iorank) ) {
 | 
			
		||||
    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 | 
			
		||||
  }
 | 
			
		||||
      }
 | 
			
		||||
      // Poke at destination
 | 
			
		||||
      if ( myrank == rank ) {
 | 
			
		||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
    pokeLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
      }
 | 
			
		||||
      grid->Barrier(); // necessary?
 | 
			
		||||
    }
 | 
			
		||||
@@ -520,7 +520,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
@@ -558,15 +558,15 @@ class BinaryIO {
 | 
			
		||||
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
 | 
			
		||||
 | 
			
		||||
      if (parallel[d]) {
 | 
			
		||||
	range[d] = grid->_ldimensions[d];
 | 
			
		||||
	start[d] = grid->_processor_coor[d]*range[d];
 | 
			
		||||
	ioproc[d]= grid->_processor_coor[d];
 | 
			
		||||
  range[d] = grid->_ldimensions[d];
 | 
			
		||||
  start[d] = grid->_processor_coor[d]*range[d];
 | 
			
		||||
  ioproc[d]= grid->_processor_coor[d];
 | 
			
		||||
      } else {
 | 
			
		||||
	range[d] = grid->_gdimensions[d];
 | 
			
		||||
	start[d] = 0;
 | 
			
		||||
	ioproc[d]= 0;
 | 
			
		||||
  range[d] = grid->_gdimensions[d];
 | 
			
		||||
  start[d] = 0;
 | 
			
		||||
  ioproc[d]= 0;
 | 
			
		||||
 | 
			
		||||
	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
 | 
			
		||||
  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      slice_vol = slice_vol * range[d];
 | 
			
		||||
@@ -577,9 +577,9 @@ class BinaryIO {
 | 
			
		||||
      grid->GlobalSum(tmp);
 | 
			
		||||
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
 | 
			
		||||
      for(int d=0;d<grid->_ndimension;d++){
 | 
			
		||||
	std::cout<< range[d];
 | 
			
		||||
	if( d< grid->_ndimension-1 ) 
 | 
			
		||||
	  std::cout<< " x ";
 | 
			
		||||
  std::cout<< range[d];
 | 
			
		||||
  if( d< grid->_ndimension-1 ) 
 | 
			
		||||
    std::cout<< " x ";
 | 
			
		||||
      }
 | 
			
		||||
      std::cout << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
@@ -610,7 +610,7 @@ class BinaryIO {
 | 
			
		||||
    // should aggregate a whole chunk and then write.
 | 
			
		||||
    // need to implement these loops in Nd independent way with a lexico conversion
 | 
			
		||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
			
		||||
	
 | 
			
		||||
  
 | 
			
		||||
      std::vector<int> tsite(nd); // temporary mixed up site
 | 
			
		||||
      std::vector<int> gsite(nd);
 | 
			
		||||
      std::vector<int> lsite(nd);
 | 
			
		||||
@@ -619,8 +619,8 @@ class BinaryIO {
 | 
			
		||||
      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
	gsite[d] = tsite[d]+start[d];               // global site
 | 
			
		||||
  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
  gsite[d] = tsite[d]+start[d];               // global site
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -640,26 +640,26 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      // Pair of nodes may need to do pt2pt send
 | 
			
		||||
      if ( rank != iorank ) { // comms is necessary
 | 
			
		||||
	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
 | 
			
		||||
	  // Send to IOrank 
 | 
			
		||||
	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 | 
			
		||||
	}
 | 
			
		||||
  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
 | 
			
		||||
    // Send to IOrank 
 | 
			
		||||
    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 | 
			
		||||
  }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      grid->Barrier(); // necessary?
 | 
			
		||||
 | 
			
		||||
      if (myrank == iorank) {
 | 
			
		||||
	
 | 
			
		||||
	munge(siteObj,fileObj,csum);
 | 
			
		||||
  
 | 
			
		||||
  munge(siteObj,fileObj,csum);
 | 
			
		||||
 | 
			
		||||
	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	
 | 
			
		||||
	fout.seekp(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
	fout.write((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	bytes+=sizeof(fileObj);
 | 
			
		||||
  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  
 | 
			
		||||
  fout.seekp(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
  fout.write((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
  bytes+=sizeof(fileObj);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -668,7 +668,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -55,11 +55,14 @@ namespace QCD {
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // QCD iMatrix types
 | 
			
		||||
    // Index conventions:                            Lorentz x Spin x Colour
 | 
			
		||||
    // note: static const int or constexpr will work for type deductions
 | 
			
		||||
    //       with the intel compiler (up to version 17)
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    static const int ColourIndex = 2;
 | 
			
		||||
    static const int SpinIndex   = 1;
 | 
			
		||||
    static const int LorentzIndex= 0;
 | 
			
		||||
    #define ColourIndex  2
 | 
			
		||||
    #define SpinIndex    1
 | 
			
		||||
    #define LorentzIndex 0
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
    // Also should make these a named enum type
 | 
			
		||||
    static const int DaggerNo=0;
 | 
			
		||||
    static const int DaggerYes=1;
 | 
			
		||||
 
 | 
			
		||||
@@ -111,12 +111,16 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 | 
			
		||||
#define FermOp4dVecTemplateInstantiate(A) \
 | 
			
		||||
  template class A<WilsonImplF>;		\
 | 
			
		||||
  template class A<WilsonImplD>;		\
 | 
			
		||||
  template class A<ZWilsonImplF>;		\
 | 
			
		||||
  template class A<ZWilsonImplD>;		\
 | 
			
		||||
  template class A<GparityWilsonImplF>;		\
 | 
			
		||||
  template class A<GparityWilsonImplD>;		
 | 
			
		||||
 | 
			
		||||
#define FermOp5dVecTemplateInstantiate(A) \
 | 
			
		||||
  template class A<DomainWallVec5dImplF>;	\
 | 
			
		||||
  template class A<DomainWallVec5dImplD>;	
 | 
			
		||||
  template class A<DomainWallVec5dImplD>;	\
 | 
			
		||||
  template class A<ZDomainWallVec5dImplF>;	\
 | 
			
		||||
  template class A<ZDomainWallVec5dImplD>;	
 | 
			
		||||
 | 
			
		||||
#define FermOpTemplateInstantiate(A) \
 | 
			
		||||
 FermOp4dVecTemplateInstantiate(A) \
 | 
			
		||||
@@ -138,6 +142,7 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
 | 
			
		||||
@@ -176,6 +181,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
			
		||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
			
		||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
			
		||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
			
		||||
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
			
		||||
 | 
			
		||||
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 | 
			
		||||
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 | 
			
		||||
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
 | 
			
		||||
 
 | 
			
		||||
@@ -54,18 +54,18 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag (Ls,1.0);
 | 
			
		||||
  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
 | 
			
		||||
  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
 | 
			
		||||
  std::vector<Coeff_t> diag (Ls,1.0);
 | 
			
		||||
  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
 | 
			
		||||
  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
 | 
			
		||||
  M5D(psi,chi,chi,lower,diag,upper);
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag = bs;
 | 
			
		||||
  std::vector<RealD> upper= cs;
 | 
			
		||||
  std::vector<RealD> lower= cs; 
 | 
			
		||||
  std::vector<Coeff_t> diag = bs;
 | 
			
		||||
  std::vector<Coeff_t> upper= cs;
 | 
			
		||||
  std::vector<Coeff_t> lower= cs; 
 | 
			
		||||
  upper[Ls-1]=-mass*upper[Ls-1];
 | 
			
		||||
  lower[0]   =-mass*lower[0];
 | 
			
		||||
  M5D(psi,psi,Din,lower,diag,upper);
 | 
			
		||||
@@ -73,9 +73,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 | 
			
		||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag = beo;
 | 
			
		||||
  std::vector<RealD> upper(Ls);
 | 
			
		||||
  std::vector<RealD> lower(Ls);
 | 
			
		||||
  std::vector<Coeff_t> diag = beo;
 | 
			
		||||
  std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
  std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
  for(int i=0;i<Ls;i++) {
 | 
			
		||||
    upper[i]=-ceo[i];
 | 
			
		||||
    lower[i]=-ceo[i];
 | 
			
		||||
@@ -88,9 +88,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag = bee;
 | 
			
		||||
  std::vector<RealD> upper(Ls);
 | 
			
		||||
  std::vector<RealD> lower(Ls);
 | 
			
		||||
  std::vector<Coeff_t> diag = bee;
 | 
			
		||||
  std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
  std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
  for(int i=0;i<Ls;i++) {
 | 
			
		||||
    upper[i]=-cee[i];
 | 
			
		||||
    lower[i]=-cee[i];
 | 
			
		||||
@@ -104,9 +104,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag = bee;
 | 
			
		||||
  std::vector<RealD> upper(Ls);
 | 
			
		||||
  std::vector<RealD> lower(Ls);
 | 
			
		||||
  std::vector<Coeff_t> diag = bee;
 | 
			
		||||
  std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
  std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
 | 
			
		||||
  for (int s=0;s<Ls;s++){
 | 
			
		||||
    // Assemble the 5d matrix
 | 
			
		||||
@@ -129,9 +129,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag(Ls,1.0);
 | 
			
		||||
  std::vector<RealD> upper(Ls,-1.0);
 | 
			
		||||
  std::vector<RealD> lower(Ls,-1.0);
 | 
			
		||||
  std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
  std::vector<Coeff_t> upper(Ls,-1.0);
 | 
			
		||||
  std::vector<Coeff_t> lower(Ls,-1.0);
 | 
			
		||||
  upper[Ls-1]=-mass*upper[Ls-1];
 | 
			
		||||
  lower[0]   =-mass*lower[0];
 | 
			
		||||
  M5Ddag(psi,chi,chi,lower,diag,upper);
 | 
			
		||||
@@ -141,9 +141,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  std::vector<RealD> diag =bs;
 | 
			
		||||
  std::vector<RealD> upper=cs;
 | 
			
		||||
  std::vector<RealD> lower=cs;
 | 
			
		||||
  std::vector<Coeff_t> diag =bs;
 | 
			
		||||
  std::vector<Coeff_t> upper=cs;
 | 
			
		||||
  std::vector<Coeff_t> lower=cs;
 | 
			
		||||
  upper[Ls-1]=-mass*upper[Ls-1];
 | 
			
		||||
  lower[0]   =-mass*lower[0];
 | 
			
		||||
  M5Ddag(psi,psi,Din,lower,diag,upper);
 | 
			
		||||
@@ -273,11 +273,21 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 | 
			
		||||
{
 | 
			
		||||
  SetCoefficientsZolotarev(1.0,zdata,b,c);
 | 
			
		||||
  std::vector<Coeff_t> gamma(this->Ls);
 | 
			
		||||
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
 | 
			
		||||
  SetCoefficientsInternal(1.0,gamma,b,c);
 | 
			
		||||
}
 | 
			
		||||
//Zolo
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<Coeff_t> gamma(this->Ls);
 | 
			
		||||
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
 | 
			
		||||
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 | 
			
		||||
}
 | 
			
		||||
//Zolo
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
 | 
			
		||||
@@ -315,7 +325,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
 | 
			
		||||
  double bmc = b-c;
 | 
			
		||||
  for(int i=0; i < Ls; i++){
 | 
			
		||||
    as[i] = 1.0;
 | 
			
		||||
    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
 | 
			
		||||
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
 | 
			
		||||
    bs[i] = 0.5*(bpc/omega[i] + bmc);
 | 
			
		||||
    cs[i] = 0.5*(bpc/omega[i] - bmc);
 | 
			
		||||
  }
 | 
			
		||||
@@ -377,7 +387,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
 | 
			
		||||
  }
 | 
			
		||||
	
 | 
			
		||||
  { 
 | 
			
		||||
    double delta_d=mass*cee[Ls-1];
 | 
			
		||||
    Coeff_t delta_d=mass*cee[Ls-1];
 | 
			
		||||
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
 | 
			
		||||
    dee[Ls-1] += delta_d;
 | 
			
		||||
  }  
 | 
			
		||||
 
 | 
			
		||||
@@ -62,16 +62,16 @@ namespace Grid {
 | 
			
		||||
      void M5D(const FermionField &psi,
 | 
			
		||||
	       const FermionField &phi, 
 | 
			
		||||
	       FermionField &chi,
 | 
			
		||||
	       std::vector<RealD> &lower,
 | 
			
		||||
	       std::vector<RealD> &diag,
 | 
			
		||||
	       std::vector<RealD> &upper);
 | 
			
		||||
	       std::vector<Coeff_t> &lower,
 | 
			
		||||
	       std::vector<Coeff_t> &diag,
 | 
			
		||||
	       std::vector<Coeff_t> &upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField &psi,
 | 
			
		||||
		  const FermionField &phi, 
 | 
			
		||||
		  FermionField &chi,
 | 
			
		||||
		  std::vector<RealD> &lower,
 | 
			
		||||
		  std::vector<RealD> &diag,
 | 
			
		||||
		  std::vector<RealD> &upper);
 | 
			
		||||
		  std::vector<Coeff_t> &lower,
 | 
			
		||||
		  std::vector<Coeff_t> &diag,
 | 
			
		||||
		  std::vector<Coeff_t> &upper);
 | 
			
		||||
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
 | 
			
		||||
 | 
			
		||||
      virtual void   Instantiatable(void)=0;
 | 
			
		||||
@@ -91,23 +91,23 @@ namespace Grid {
 | 
			
		||||
      RealD mass;
 | 
			
		||||
 | 
			
		||||
      // Cayley form Moebius (tanh and zolotarev)
 | 
			
		||||
      std::vector<RealD> omega; 
 | 
			
		||||
      std::vector<RealD> bs;    // S dependent coeffs
 | 
			
		||||
      std::vector<RealD> cs;    
 | 
			
		||||
      std::vector<RealD> as;    
 | 
			
		||||
      std::vector<Coeff_t> omega; 
 | 
			
		||||
      std::vector<Coeff_t> bs;    // S dependent coeffs
 | 
			
		||||
      std::vector<Coeff_t> cs;    
 | 
			
		||||
      std::vector<Coeff_t> as;    
 | 
			
		||||
      // For preconditioning Cayley form
 | 
			
		||||
      std::vector<RealD> bee;    
 | 
			
		||||
      std::vector<RealD> cee;    
 | 
			
		||||
      std::vector<RealD> aee;    
 | 
			
		||||
      std::vector<RealD> beo;    
 | 
			
		||||
      std::vector<RealD> ceo;    
 | 
			
		||||
      std::vector<RealD> aeo;    
 | 
			
		||||
      std::vector<Coeff_t> bee;    
 | 
			
		||||
      std::vector<Coeff_t> cee;    
 | 
			
		||||
      std::vector<Coeff_t> aee;    
 | 
			
		||||
      std::vector<Coeff_t> beo;    
 | 
			
		||||
      std::vector<Coeff_t> ceo;    
 | 
			
		||||
      std::vector<Coeff_t> aeo;    
 | 
			
		||||
      // LDU factorisation of the eeoo matrix
 | 
			
		||||
      std::vector<RealD> lee;    
 | 
			
		||||
      std::vector<RealD> leem;    
 | 
			
		||||
      std::vector<RealD> uee;    
 | 
			
		||||
      std::vector<RealD> ueem;    
 | 
			
		||||
      std::vector<RealD> dee;    
 | 
			
		||||
      std::vector<Coeff_t> lee;    
 | 
			
		||||
      std::vector<Coeff_t> leem;    
 | 
			
		||||
      std::vector<Coeff_t> uee;    
 | 
			
		||||
      std::vector<Coeff_t> ueem;    
 | 
			
		||||
      std::vector<Coeff_t> dee;    
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      CayleyFermion5D(GaugeField &_Umu,
 | 
			
		||||
@@ -117,20 +117,19 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#define INSTANTIATE_DPERP(A)\
 | 
			
		||||
template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 | 
			
		||||
					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
 | 
			
		||||
					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 | 
			
		||||
template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 | 
			
		||||
					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
 | 
			
		||||
					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 | 
			
		||||
template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 | 
			
		||||
template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -43,9 +43,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
				const FermionField &phi, 
 | 
			
		||||
				FermionField &chi,
 | 
			
		||||
				std::vector<RealD> &lower,
 | 
			
		||||
				std::vector<RealD> &diag,
 | 
			
		||||
				std::vector<RealD> &upper)
 | 
			
		||||
				std::vector<Coeff_t> &lower,
 | 
			
		||||
				std::vector<Coeff_t> &diag,
 | 
			
		||||
				std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  int Ls =this->Ls;
 | 
			
		||||
  GridBase *grid=psi._grid;
 | 
			
		||||
@@ -82,9 +82,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		||||
				   const FermionField &phi, 
 | 
			
		||||
				   FermionField &chi,
 | 
			
		||||
				   std::vector<RealD> &lower,
 | 
			
		||||
				   std::vector<RealD> &diag,
 | 
			
		||||
				   std::vector<RealD> &upper)
 | 
			
		||||
				   std::vector<Coeff_t> &lower,
 | 
			
		||||
				   std::vector<Coeff_t> &diag,
 | 
			
		||||
				   std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  int Ls =this->Ls;
 | 
			
		||||
  GridBase *grid=psi._grid;
 | 
			
		||||
@@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  INSTANTIATE_DPERP(WilsonImplD);
 | 
			
		||||
  INSTANTIATE_DPERP(GparityWilsonImplF);
 | 
			
		||||
  INSTANTIATE_DPERP(GparityWilsonImplD);
 | 
			
		||||
  INSTANTIATE_DPERP(ZWilsonImplF);
 | 
			
		||||
  INSTANTIATE_DPERP(ZWilsonImplD);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -43,9 +43,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
				const FermionField &phi, 
 | 
			
		||||
				FermionField &chi,
 | 
			
		||||
				std::vector<RealD> &lower,
 | 
			
		||||
				std::vector<RealD> &diag,
 | 
			
		||||
				std::vector<RealD> &upper)
 | 
			
		||||
				std::vector<Coeff_t> &lower,
 | 
			
		||||
				std::vector<Coeff_t> &diag,
 | 
			
		||||
				std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  for(int s=0;s<Ls;s++){
 | 
			
		||||
@@ -65,9 +65,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		||||
				   const FermionField &phi, 
 | 
			
		||||
				   FermionField &chi,
 | 
			
		||||
				   std::vector<RealD> &lower,
 | 
			
		||||
				   std::vector<RealD> &diag,
 | 
			
		||||
				   std::vector<RealD> &upper)
 | 
			
		||||
				   std::vector<Coeff_t> &lower,
 | 
			
		||||
				   std::vector<Coeff_t> &diag,
 | 
			
		||||
				   std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
  for(int s=0;s<Ls;s++){
 | 
			
		||||
 
 | 
			
		||||
@@ -53,9 +53,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
				const FermionField &phi, 
 | 
			
		||||
				FermionField &chi,
 | 
			
		||||
				std::vector<RealD> &lower,
 | 
			
		||||
				std::vector<RealD> &diag,
 | 
			
		||||
				std::vector<RealD> &upper)
 | 
			
		||||
				std::vector<Coeff_t> &lower,
 | 
			
		||||
				std::vector<Coeff_t> &diag,
 | 
			
		||||
				std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *grid=psi._grid;
 | 
			
		||||
  int Ls   = this->Ls;
 | 
			
		||||
@@ -121,9 +121,9 @@ template<class Impl>
 | 
			
		||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		||||
				   const FermionField &phi, 
 | 
			
		||||
				   FermionField &chi,
 | 
			
		||||
				   std::vector<RealD> &lower,
 | 
			
		||||
				   std::vector<RealD> &diag,
 | 
			
		||||
				   std::vector<RealD> &upper)
 | 
			
		||||
				   std::vector<Coeff_t> &lower,
 | 
			
		||||
				   std::vector<Coeff_t> &diag,
 | 
			
		||||
				   std::vector<Coeff_t> &upper)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *grid=psi._grid;
 | 
			
		||||
  int Ls   = this->Ls;
 | 
			
		||||
@@ -194,8 +194,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		||||
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
  
 | 
			
		||||
  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
  
 | 
			
		||||
  for(int s=0;s<Ls;s++){
 | 
			
		||||
    Pplus(s,s) = bee[s];
 | 
			
		||||
@@ -212,8 +212,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		||||
  Pplus (0,Ls-1) = mass*cee[0];
 | 
			
		||||
  Pminus(Ls-1,0) = mass*cee[Ls-1];
 | 
			
		||||
  
 | 
			
		||||
  Eigen::MatrixXd PplusMat ;
 | 
			
		||||
  Eigen::MatrixXd PminusMat;
 | 
			
		||||
  Eigen::MatrixXcd PplusMat ;
 | 
			
		||||
  Eigen::MatrixXcd PminusMat;
 | 
			
		||||
  
 | 
			
		||||
  if ( inv ) {
 | 
			
		||||
    PplusMat =Pplus.inverse();
 | 
			
		||||
@@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
 | 
			
		||||
INSTANTIATE_DPERP(DomainWallVec5dImplF);
 | 
			
		||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
 | 
			
		||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
 | 
			
		||||
 | 
			
		||||
template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 | 
			
		||||
template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 | 
			
		||||
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 | 
			
		||||
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -100,7 +100,8 @@ namespace Grid {
 | 
			
		||||
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
 | 
			
		||||
    typedef typename Impl::Compressor               Compressor;		\
 | 
			
		||||
    typedef typename Impl::StencilImpl             StencilImpl;	  \
 | 
			
		||||
    typedef typename Impl::ImplParams ImplParams;
 | 
			
		||||
    typedef typename Impl::ImplParams ImplParams; \
 | 
			
		||||
    typedef typename Impl::Coeff_t       Coeff_t;
 | 
			
		||||
 | 
			
		||||
#define INHERIT_IMPL_TYPES(Base) \
 | 
			
		||||
    INHERIT_GIMPL_TYPES(Base)\
 | 
			
		||||
@@ -109,12 +110,14 @@ namespace Grid {
 | 
			
		||||
    ///////
 | 
			
		||||
    // Single flavour four spinors with colour index
 | 
			
		||||
    ///////
 | 
			
		||||
    template<class S,int Nrepresentation=Nc>
 | 
			
		||||
    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
 | 
			
		||||
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      const bool LsVectorised=false;
 | 
			
		||||
 | 
			
		||||
      typedef _Coeff_t Coeff_t;
 | 
			
		||||
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 | 
			
		||||
 | 
			
		||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
@@ -192,12 +195,13 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ///////
 | 
			
		||||
    // Single flavour four spinors with colour index, 5d redblack
 | 
			
		||||
    ///////
 | 
			
		||||
    template<class S,int Nrepresentation=Nc>
 | 
			
		||||
    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
 | 
			
		||||
    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
			
		||||
    public:
 | 
			
		||||
    
 | 
			
		||||
      const bool LsVectorised=true;
 | 
			
		||||
 | 
			
		||||
      typedef _Coeff_t Coeff_t;
 | 
			
		||||
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 | 
			
		||||
 | 
			
		||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
@@ -287,12 +291,13 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    template<class S,int Nrepresentation>
 | 
			
		||||
    template<class S,int Nrepresentation,class _Coeff_t = RealD>
 | 
			
		||||
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
      const bool LsVectorised=false;
 | 
			
		||||
 | 
			
		||||
      typedef _Coeff_t Coeff_t;
 | 
			
		||||
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
 | 
			
		||||
 | 
			
		||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
@@ -483,6 +488,18 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 | 
			
		||||
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef WilsonImpl<vComplex ,Nc,ComplexD> ZWilsonImplR; // Real.. whichever prec
 | 
			
		||||
    typedef WilsonImpl<vComplexF,Nc,ComplexD> ZWilsonImplF; // Float
 | 
			
		||||
    typedef WilsonImpl<vComplexD,Nc,ComplexD> ZWilsonImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
 | 
			
		||||
    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
@@ -68,16 +68,21 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  // No asm implementation yet.
 | 
			
		||||
  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
  //  else
 | 
			
		||||
  for(int site=0;site<Ns;site++) {
 | 
			
		||||
    for(int s=0;s<Ls;s++) {
 | 
			
		||||
      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
      sF++;
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  if ( AsmOpt ) {
 | 
			
		||||
    WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
  } else {
 | 
			
		||||
#else
 | 
			
		||||
  {  
 | 
			
		||||
#endif
 | 
			
		||||
    for(int site=0;site<Ns;site++) {
 | 
			
		||||
      for(int s=0;s<Ls;s++) {
 | 
			
		||||
	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	sF++;
 | 
			
		||||
      }
 | 
			
		||||
      sU++;
 | 
			
		||||
    }
 | 
			
		||||
    sU++;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -79,6 +79,10 @@ namespace Grid {
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
     void DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
@@ -92,7 +96,25 @@ namespace Grid {
 | 
			
		||||
     WilsonKernels(const ImplParams &p= ImplParams());
 | 
			
		||||
     
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    // Default to no assembler implementation
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
    {
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
    {
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
        
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
@@ -26,59 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Default to no assembler implementation
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  namespace QCD {
 | 
			
		||||
    
 | 
			
		||||
#if defined(AVX512) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // If we are AVX512 specialise the single precision routine
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    // If we are AVX512 specialise the single precision routine
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    
 | 
			
		||||
#include <simd/Intel512wilson.h>
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
 | 
			
		||||
static Vector<vComplexF> signs;
 | 
			
		||||
 | 
			
		||||
int setupSigns(void ){
 | 
			
		||||
  Vector<vComplexF> bother(2);
 | 
			
		||||
  signs = bother;
 | 
			
		||||
  vrsign(signs[0]);
 | 
			
		||||
  visign(signs[1]);
 | 
			
		||||
  return 1;
 | 
			
		||||
}
 | 
			
		||||
static int signInit = setupSigns();
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    static Vector<vComplexF> signs;
 | 
			
		||||
    
 | 
			
		||||
    int setupSigns(void ){
 | 
			
		||||
      Vector<vComplexF> bother(2);
 | 
			
		||||
      signs = bother;
 | 
			
		||||
      vrsign(signs[0]);
 | 
			
		||||
      visign(signs[1]);
 | 
			
		||||
      return 1;
 | 
			
		||||
    }
 | 
			
		||||
    static int signInit = setupSigns();
 | 
			
		||||
  
 | 
			
		||||
#define label(A)  ilabel(A)
 | 
			
		||||
#define ilabel(A) ".globl\n"  #A ":\n" 
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
 | 
			
		||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 | 
			
		||||
#define FX(A) WILSONASM_ ##A
 | 
			
		||||
template<>
 | 
			
		||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
  
 | 
			
		||||
#undef KERNEL_DAG
 | 
			
		||||
    template<>
 | 
			
		||||
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
#define KERNEL_DAG
 | 
			
		||||
    template<>
 | 
			
		||||
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
			
		||||
				    
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef MAYBEPERM
 | 
			
		||||
@@ -89,32 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 | 
			
		||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 | 
			
		||||
template<>
 | 
			
		||||
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
				    
 | 
			
		||||
#undef KERNEL_DAG
 | 
			
		||||
    template<>
 | 
			
		||||
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
			
		||||
 | 
			
		||||
				    
 | 
			
		||||
#define KERNEL_DAG
 | 
			
		||||
    template<>
 | 
			
		||||
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
			
		||||
				    
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
}}
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -30,7 +30,11 @@
 | 
			
		||||
  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    XP_PROJMEM(base);
 | 
			
		||||
#else 
 | 
			
		||||
    XM_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR3,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -41,15 +45,22 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFXP(Xp,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  XP_RECON;
 | 
			
		||||
#else
 | 
			
		||||
  XM_RECON;
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    YP_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    YM_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR2,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -60,7 +71,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFYP(Yp,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
@@ -68,7 +83,11 @@
 | 
			
		||||
  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    ZP_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    ZM_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR1,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -79,7 +98,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFZP(Zp,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
@@ -87,7 +110,11 @@
 | 
			
		||||
  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    TP_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    TM_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR0,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -98,7 +125,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFTP(Tp,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
@@ -107,7 +138,11 @@
 | 
			
		||||
  //  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    XM_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    XP_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR3,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -118,7 +153,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFXM(Xm,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  XM_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  XP_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
@@ -126,7 +165,11 @@
 | 
			
		||||
  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    YM_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    YP_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR2,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -137,7 +180,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFYM(Ym,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
@@ -145,7 +192,11 @@
 | 
			
		||||
  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    ZM_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    ZP_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR1,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -156,7 +207,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFZM(Zm,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
@@ -164,7 +219,11 @@
 | 
			
		||||
  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
    TM_PROJMEM(base);
 | 
			
		||||
#else
 | 
			
		||||
    TP_PROJMEM(base);
 | 
			
		||||
#endif
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR0,perm);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(base);
 | 
			
		||||
@@ -175,7 +234,11 @@
 | 
			
		||||
    MULT_2SPIN_DIR_PFTM(Tm,basep);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
#ifdef KERNEL_DAG
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
#else
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
			
		||||
  SAVE_RESULT(base,basep);
 | 
			
		||||
 
 | 
			
		||||
@@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 | 
			
		||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
			
		||||
// Need Nc=3 though //
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
#define INSTANTIATE_THEM(A) \
 | 
			
		||||
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);\
 | 
			
		||||
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
 | 
			
		||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 | 
			
		||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplF);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplD);
 | 
			
		||||
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
 | 
			
		||||
INSTANTIATE_THEM(ZDomainWallVec5dImplD);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										79
									
								
								lib/qcd/action/fermion/ZMobiusFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								lib/qcd/action/fermion/ZMobiusFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,79 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/MobiusFermion.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
 | 
			
		||||
#define  GRID_QCD_ZMOBIUS_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  namespace QCD {
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    class ZMobiusFermion : public CayleyFermion5D<Impl>
 | 
			
		||||
    {
 | 
			
		||||
    public:
 | 
			
		||||
     INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
      virtual void   Instantiatable(void) {};
 | 
			
		||||
      // Constructors
 | 
			
		||||
      ZMobiusFermion(GaugeField &_Umu,
 | 
			
		||||
		     GridCartesian         &FiveDimGrid,
 | 
			
		||||
		     GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
		     GridCartesian         &FourDimGrid,
 | 
			
		||||
		     GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		     RealD _mass,RealD _M5,
 | 
			
		||||
		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
 | 
			
		||||
      
 | 
			
		||||
      CayleyFermion5D<Impl>(_Umu,
 | 
			
		||||
			    FiveDimGrid,
 | 
			
		||||
			    FiveDimRedBlackGrid,
 | 
			
		||||
			    FourDimGrid,
 | 
			
		||||
			    FourDimRedBlackGrid,_mass,_M5,p)
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	RealD eps = 1.0;
 | 
			
		||||
	
 | 
			
		||||
	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
 | 
			
		||||
	std::vector<Coeff_t> zgamma(this->Ls);
 | 
			
		||||
	for(int s=0;s<this->Ls;s++){
 | 
			
		||||
	  zgamma[s] = gamma[s];
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Call base setter
 | 
			
		||||
	this->SetCoefficientsInternal(1.0,zgamma,b,c);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -1,300 +1,421 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Grid_qpx.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: neo <cossu@post.kek.jp>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
/*! @file Grid_qpx.h
 | 
			
		||||
  @brief Optimization libraries for QPX instructions set for BG/Q
 | 
			
		||||
 | 
			
		||||
  Using intrinsics
 | 
			
		||||
*/
 | 
			
		||||
// Time-stamp: <2015-05-27 11:30:21 neo>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
// lot of undefined functions
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 
 | 
			
		||||
 Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 
 | 
			
		||||
 Source file: ./lib/simd/Grid_qpx.h
 | 
			
		||||
 
 | 
			
		||||
 Copyright (C) 2016
 | 
			
		||||
 
 | 
			
		||||
 Author: Antonin Portelli <antonin.portelli@me.com>
 | 
			
		||||
 
 | 
			
		||||
 This program is free software; you can redistribute it and/or modify
 | 
			
		||||
 it under the terms of the GNU General Public License as published by
 | 
			
		||||
 the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
 (at your option) any later version.
 | 
			
		||||
 
 | 
			
		||||
 This program is distributed in the hope that it will be useful,
 | 
			
		||||
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
 GNU General Public License for more details.
 | 
			
		||||
 
 | 
			
		||||
 You should have received a copy of the GNU General Public License along
 | 
			
		||||
 with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 
 | 
			
		||||
 See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
  typedef struct 
 | 
			
		||||
  {
 | 
			
		||||
    float v0,v1,v2,v3;
 | 
			
		||||
  } vector4float;
 | 
			
		||||
 | 
			
		||||
  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
 | 
			
		||||
  {
 | 
			
		||||
    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
 | 
			
		||||
    return stream;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
 | 
			
		||||
  {
 | 
			
		||||
    stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
 | 
			
		||||
    return stream;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
    //Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return {a,b,a,b};
 | 
			
		||||
    inline vector4float operator()(float a, float b){
 | 
			
		||||
      return (vector4float){a, b, a, b};
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a){
 | 
			
		||||
      return {a,a,a,a};
 | 
			
		||||
    inline vector4float operator()(float a){
 | 
			
		||||
      return (vector4float){a, a, a, a};
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline vector4double operator()(double a, double b){
 | 
			
		||||
      return {a,b,a,b};
 | 
			
		||||
      return (vector4double){a, b, a, b};
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    inline vector4double operator()(double a){
 | 
			
		||||
      return {a,a,a,a};
 | 
			
		||||
      return (vector4double){a, a, a, a};
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(Integer a){
 | 
			
		||||
#error
 | 
			
		||||
      return a;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct Vstore{
 | 
			
		||||
    //Float 
 | 
			
		||||
    inline void operator()(float a, float* F){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    //Float
 | 
			
		||||
    inline void operator()(vector4double a, float *f){
 | 
			
		||||
      vec_st(a, 0, f);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline void operator()(vector4double a, vector4float &f){
 | 
			
		||||
      vec_st(a, 0, (float *)(&f));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline void operator()(vector4float a, float *f){
 | 
			
		||||
      f[0] = a.v0;
 | 
			
		||||
      f[1] = a.v1;
 | 
			
		||||
      f[2] = a.v2;
 | 
			
		||||
      f[3] = a.v3;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(vector4double a, double* D){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    inline void operator()(vector4double a, double *d){
 | 
			
		||||
      vec_st(a, 0, d);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline void operator()(int a, Integer* I){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    inline void operator()(int a, Integer *i){
 | 
			
		||||
      i[0] = a;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct Vstream{
 | 
			
		||||
    //Float
 | 
			
		||||
    inline void operator()(float * a, float b){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double * a, vector4double b){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    inline void operator()(float *f, vector4double a){
 | 
			
		||||
      vec_st(a, 0, f);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline void operator()(vector4float f, vector4double a){
 | 
			
		||||
      vec_st(a, 0, (float *)(&f));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline void operator()(float *f, vector4float a){
 | 
			
		||||
      f[0] = a.v0;
 | 
			
		||||
      f[1] = a.v1;
 | 
			
		||||
      f[2] = a.v2;
 | 
			
		||||
      f[3] = a.v3;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double *d, vector4double a){
 | 
			
		||||
      vec_st(a, 0, d);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct Vset{
 | 
			
		||||
    // Complex float 
 | 
			
		||||
    inline float operator()(Grid::ComplexF *a){
 | 
			
		||||
      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline vector4float operator()(Grid::ComplexF *a){
 | 
			
		||||
      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double 
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline vector4double operator()(Grid::ComplexD *a){
 | 
			
		||||
      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
 | 
			
		||||
      return vec_ld(0, (double *)a);
 | 
			
		||||
    }
 | 
			
		||||
    // Real float 
 | 
			
		||||
    inline float operator()(float *a){
 | 
			
		||||
      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
 | 
			
		||||
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline vector4float operator()(float *a){
 | 
			
		||||
      return (vector4float){a[0], a[1], a[2], a[3]};
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline vector4double operator()(vector4float a){
 | 
			
		||||
      return vec_ld(0, (float *)(&a));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline vector4double operator()(double *a){
 | 
			
		||||
      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
 | 
			
		||||
      return vec_ld(0, a);
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(Integer *a){
 | 
			
		||||
#error
 | 
			
		||||
      return a[0];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  template <typename Out_type, typename In_type>
 | 
			
		||||
    struct Reduce{
 | 
			
		||||
      //Need templated class to overload output type
 | 
			
		||||
      //General form must generate error if compiled
 | 
			
		||||
      inline Out_type operator()(In_type in){
 | 
			
		||||
	printf("Error, using wrong Reduce function\n");
 | 
			
		||||
	exit(1);
 | 
			
		||||
	return 0;
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
 | 
			
		||||
  struct Reduce{
 | 
			
		||||
    //Need templated class to overload output type
 | 
			
		||||
    //General form must generate error if compiled
 | 
			
		||||
    inline Out_type operator()(In_type in){
 | 
			
		||||
      printf("Error, using wrong Reduce function\n");
 | 
			
		||||
      exit(1);
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  // Arithmetic operations
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  #define FLOAT_WRAP_2(fn, pref)\
 | 
			
		||||
  pref vector4float fn(vector4float a, vector4float b)\
 | 
			
		||||
  {\
 | 
			
		||||
    vector4double ad, bd, rd;\
 | 
			
		||||
    vector4float  r;\
 | 
			
		||||
    \
 | 
			
		||||
    ad = Vset()(a);\
 | 
			
		||||
    bd = Vset()(b);\
 | 
			
		||||
    rd = fn(ad, bd);\
 | 
			
		||||
    Vstore()(rd, r);\
 | 
			
		||||
    \
 | 
			
		||||
    return r;\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #define FLOAT_WRAP_1(fn, pref)\
 | 
			
		||||
  pref vector4float fn(vector4float a)\
 | 
			
		||||
  {\
 | 
			
		||||
    vector4double ad, rd;\
 | 
			
		||||
    vector4float  r;\
 | 
			
		||||
    \
 | 
			
		||||
    ad = Vset()(a);\
 | 
			
		||||
    rd = fn(ad);\
 | 
			
		||||
    Vstore()(rd, r);\
 | 
			
		||||
    \
 | 
			
		||||
    return r;\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  struct Sum{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
#error
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
      return vec_add(a,b);
 | 
			
		||||
      return vec_add(a, b);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
#error
 | 
			
		||||
      return a + b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct Sub{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
#error
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
#error
 | 
			
		||||
      return vec_sub(a, b);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline floati operator()(int a, int b){
 | 
			
		||||
#error
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return a - b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
#error
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
#error
 | 
			
		||||
      return vec_xxnpmadd(a, b, vec_xmul(b, a));
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
#error
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline vector4double operator()(vector4double a, vector4double b){
 | 
			
		||||
#error
 | 
			
		||||
      return vec_mul(a, b);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Real float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
#error
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct Conj{
 | 
			
		||||
    // Complex single
 | 
			
		||||
    inline float operator()(float in){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double in){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    inline vector4double operator()(vector4double v){
 | 
			
		||||
      return vec_mul(v, (vector4double){1., -1., 1., -1.});
 | 
			
		||||
    }
 | 
			
		||||
    // do not define for integer input
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_1(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double in, vector4double ret){
 | 
			
		||||
      assert(0);
 | 
			
		||||
    inline vector4double operator()(vector4double v, vector4double ret){
 | 
			
		||||
      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
 | 
			
		||||
                               (vector4double){0., 0., 0., 0.});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
  
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline vector4double operator()(vector4double in, vector4double ret){
 | 
			
		||||
  
 | 
			
		||||
    inline vector4double operator()(vector4double v, vector4double ret){
 | 
			
		||||
      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
 | 
			
		||||
                              (vector4double){0., 0., 0., 0.});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_2(operator(), inline)
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Permute{
 | 
			
		||||
    //Complex double
 | 
			
		||||
    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
 | 
			
		||||
      return vec_perm(v, v, vec_gpci(02301));
 | 
			
		||||
    };
 | 
			
		||||
    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
 | 
			
		||||
      return vec_perm(v, v, vec_gpci(01032));
 | 
			
		||||
    };
 | 
			
		||||
    static inline vector4double Permute2(vector4double v){
 | 
			
		||||
      return v;
 | 
			
		||||
    };
 | 
			
		||||
    static inline vector4double Permute3(vector4double v){
 | 
			
		||||
      return v;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    // Complex float
 | 
			
		||||
    FLOAT_WRAP_1(Permute0, static inline)
 | 
			
		||||
    FLOAT_WRAP_1(Permute1, static inline)
 | 
			
		||||
    FLOAT_WRAP_1(Permute2, static inline)
 | 
			
		||||
    FLOAT_WRAP_1(Permute3, static inline)
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
    static inline vector4double rotate(vector4double v, int n){
 | 
			
		||||
      switch(n){
 | 
			
		||||
        case 0:
 | 
			
		||||
          return v;
 | 
			
		||||
          break;
 | 
			
		||||
        case 1:
 | 
			
		||||
          return vec_perm(v, v, vec_gpci(01230));
 | 
			
		||||
          break;
 | 
			
		||||
        case 2:
 | 
			
		||||
          return vec_perm(v, v, vec_gpci(02301));
 | 
			
		||||
          break;
 | 
			
		||||
        case 3:
 | 
			
		||||
          return vec_perm(v, v, vec_gpci(03012));
 | 
			
		||||
          break;
 | 
			
		||||
        default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline vector4float rotate(vector4float v, int n){
 | 
			
		||||
      vector4double vd, rd;
 | 
			
		||||
      vector4float  r;
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
      vd = Vset()(v);
 | 
			
		||||
      rd = rotate(vd, n);
 | 
			
		||||
      Vstore()(rd, r);
 | 
			
		||||
 | 
			
		||||
      return r;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  inline Grid::ComplexF
 | 
			
		||||
  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
 | 
			
		||||
    vector4float v1,v2;
 | 
			
		||||
    
 | 
			
		||||
    v1 = Optimization::Permute::Permute0(v);
 | 
			
		||||
    v1 = Optimization::Sum()(v1, v);
 | 
			
		||||
    
 | 
			
		||||
    return Grid::ComplexF(v1.v0, v1.v1);
 | 
			
		||||
  }
 | 
			
		||||
  //Real float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  inline Grid::RealF
 | 
			
		||||
  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
 | 
			
		||||
    vector4float v1,v2;
 | 
			
		||||
    
 | 
			
		||||
    v1 = Optimization::Permute::Permute0(v);
 | 
			
		||||
    v1 = Optimization::Sum()(v1, v);
 | 
			
		||||
    v2 = Optimization::Permute::Permute1(v1);
 | 
			
		||||
    v1 = Optimization::Sum()(v1, v2);
 | 
			
		||||
    
 | 
			
		||||
    return v1.v0;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  //Complex double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  inline Grid::ComplexD
 | 
			
		||||
  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
 | 
			
		||||
    vector4double v1;
 | 
			
		||||
    
 | 
			
		||||
    v1 = Optimization::Permute::Permute0(v);
 | 
			
		||||
    v1 = vec_add(v1, v);
 | 
			
		||||
    
 | 
			
		||||
    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Real double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
  inline Grid::RealD
 | 
			
		||||
  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
 | 
			
		||||
    vector4double v1,v2;
 | 
			
		||||
    
 | 
			
		||||
    v1 = Optimization::Permute::Permute0(v);
 | 
			
		||||
    v1 = vec_add(v1, v);
 | 
			
		||||
    v2 = Optimization::Permute::Permute1(v1);
 | 
			
		||||
    v1 = vec_add(v1, v2);
 | 
			
		||||
 | 
			
		||||
    return vec_extract(v1, 0);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
    inline Integer Reduce<Integer, floati>::operator()(float in){
 | 
			
		||||
  inline Integer Reduce<Integer, int>::operator()(int in){
 | 
			
		||||
    // FIXME unimplemented
 | 
			
		||||
    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
 | 
			
		||||
  typedef vector4double SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef int SIMD_Itype;           // Integer type
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types
 | 
			
		||||
typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
 | 
			
		||||
typedef vector4double              SIMD_Dtype; // Double precision type
 | 
			
		||||
typedef int                        SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  inline void v_prefetch0(int size, const char *ptr){};
 | 
			
		||||
 | 
			
		||||
  // Function name aliases
 | 
			
		||||
  typedef Optimization::Vsplat   VsplatSIMD;
 | 
			
		||||
  typedef Optimization::Vstore   VstoreSIMD;
 | 
			
		||||
  typedef Optimization::Vset     VsetSIMD;
 | 
			
		||||
  typedef Optimization::Vstream  VstreamSIMD;
 | 
			
		||||
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 | 
			
		||||
// prefetch utilities
 | 
			
		||||
inline void v_prefetch0(int size, const char *ptr){};
 | 
			
		||||
inline void prefetch_HINT_T0(const char *ptr){};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Arithmetic operations
 | 
			
		||||
  typedef Optimization::Sum         SumSIMD;
 | 
			
		||||
  typedef Optimization::Sub         SubSIMD;
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
// Function name aliases
 | 
			
		||||
typedef Optimization::Vsplat   VsplatSIMD;
 | 
			
		||||
typedef Optimization::Vstore   VstoreSIMD;
 | 
			
		||||
typedef Optimization::Vset     VsetSIMD;
 | 
			
		||||
typedef Optimization::Vstream  VstreamSIMD;
 | 
			
		||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 | 
			
		||||
 | 
			
		||||
// Arithmetic operations
 | 
			
		||||
typedef Optimization::Sum         SumSIMD;
 | 
			
		||||
typedef Optimization::Sub         SubSIMD;
 | 
			
		||||
typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -388,6 +388,12 @@ class Grid_simd {
 | 
			
		||||
 | 
			
		||||
};  // end of Grid_simd class definition
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
 | 
			
		||||
inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
 | 
			
		||||
inline void permute(RealD &y,RealD b, int perm) {  y=b; }
 | 
			
		||||
inline void permute(RealF &y,RealF b, int perm) {  y=b; }
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
// General rotate
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -67,15 +67,13 @@ template <class scalar>
 | 
			
		||||
struct AsinRealFunctor {
 | 
			
		||||
  scalar operator()(const scalar &a) const { return asin(real(a)); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class scalar>
 | 
			
		||||
struct LogRealFunctor {
 | 
			
		||||
  scalar operator()(const scalar &a) const { return log(real(a)); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class scalar>
 | 
			
		||||
struct ExpRealFunctor {
 | 
			
		||||
  scalar operator()(const scalar &a) const { return exp(real(a)); }
 | 
			
		||||
struct ExpFunctor {
 | 
			
		||||
  scalar operator()(const scalar &a) const { return exp(a); }
 | 
			
		||||
};
 | 
			
		||||
template <class scalar>
 | 
			
		||||
struct NotFunctor {
 | 
			
		||||
@@ -85,7 +83,6 @@ template <class scalar>
 | 
			
		||||
struct AbsRealFunctor {
 | 
			
		||||
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class scalar>
 | 
			
		||||
struct PowRealFunctor {
 | 
			
		||||
  double y;
 | 
			
		||||
@@ -135,7 +132,6 @@ template <class Scalar>
 | 
			
		||||
inline Scalar rsqrt(const Scalar &r) {
 | 
			
		||||
  return (RSqrtRealFunctor<Scalar>(), r);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
 | 
			
		||||
  return SimdApply(CosRealFunctor<S>(), r);
 | 
			
		||||
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
 | 
			
		||||
  return SimdApply(ExpRealFunctor<S>(), r);
 | 
			
		||||
  return SimdApply(ExpFunctor<S>(), r);
 | 
			
		||||
}
 | 
			
		||||
template <class S, class V>
 | 
			
		||||
inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user