1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 07:17:06 +01:00

Merge branch 'develop' into feature/CG_repro

This commit is contained in:
Guido Cossu
2016-11-09 14:44:46 +00:00
20 changed files with 251 additions and 125 deletions

View File

@ -3,8 +3,8 @@ SUBDIRS = lib benchmarks tests
.PHONY: tests .PHONY: tests
tests: tests: all
make -C tests tests $(MAKE) -C tests tests
AM_CXXFLAGS += -I$(top_builddir)/include AM_CXXFLAGS += -I$(top_builddir)/include
ACLOCAL_AMFLAGS = -I m4 ACLOCAL_AMFLAGS = -I m4

44
README
View File

@ -1,44 +0,0 @@
This library provides data parallel C++ container classes with internal memory layout
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
array indices to both MPI tasks and SIMD processing elements.
* Identically shaped arrays then be processed with perfect data parallelisation.
* Such identically shapped arrays are called conformable arrays.
The transformation is based on the observation that Cartesian array processing involves
identical processing to be performed on different regions of the Cartesian array.
The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes.
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
for most programmers.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
These are presented as
vRealF, vRealD, vComplexF, vComplexD
internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named
RealF, RealD, ComplexF, ComplexD
MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is are examples:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none

1
README Symbolic link
View File

@ -0,0 +1 @@
README.md

View File

@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`.
### Possible communication interfaces ### Possible communication interfaces
The following options can be use with the `--enable-simd=` option to target different communication interfaces: The following options can be use with the `--enable-comms=` option to target different communication interfaces:
| `<comm>` | Description | | `<comm>` | Description |
| -------------- | ------------------------------------------------------------- | | -------------- | ------------------------------------------------------------- |

View File

@ -193,6 +193,7 @@ int main (int argc, char ** argv)
} }
} }
Nloop=100; Nloop=100;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
@ -271,5 +272,90 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl; std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
} }
} }
Nloop=100;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
for(int i=0;i<Nloop;i++){
std::vector<CartesianCommunicator::CommsRequest_t> requests;
ncomm=0;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
// Grid.StencilSendToRecvFromComplete(requests);
// requests.resize(0);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
Grid.StencilSendToRecvFromComplete(requests);
requests.resize(0);
}
}
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start; // microseconds
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
}
Grid_finalize(); Grid_finalize();
} }

View File

@ -57,7 +57,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt(); std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16; const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -138,7 +138,7 @@ int main (int argc, char ** argv)
int ncall =100; int ncall =100;
if (1) { if (1) {
FGrid->Barrier();
Dw.ZeroCounters(); Dw.ZeroCounters();
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
@ -147,6 +147,7 @@ int main (int argc, char ** argv)
__SSC_STOP; __SSC_STOP;
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall; double flops=1344*volume*ncall;
@ -158,7 +159,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-5 ); assert (norm2(err)< 1.0e-4 );
Dw.Report(); Dw.Report();
} }
@ -193,6 +194,7 @@ int main (int argc, char ** argv)
pokeSite(tmp,ssrc,site); pokeSite(tmp,ssrc,site);
}}}}} }}}}}
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl; std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
FGrid->Barrier();
double t0=usecond(); double t0=usecond();
sDw.ZeroCounters(); sDw.ZeroCounters();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
@ -201,6 +203,7 @@ int main (int argc, char ** argv)
__SSC_STOP; __SSC_STOP;
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall; double flops=1344*volume*ncall;
@ -240,7 +243,7 @@ int main (int argc, char ** argv)
} }
}}}}} }}}}}
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl; std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
assert (sum< 1.0e-5 ); assert (sum< 1.0e-4 );
if (1) { if (1) {
@ -271,6 +274,7 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl; std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
FGrid->Barrier();
sDw.ZeroCounters(); sDw.ZeroCounters();
sDw.stat.init("DhopEO"); sDw.stat.init("DhopEO");
double t0=usecond(); double t0=usecond();
@ -278,6 +282,7 @@ int main (int argc, char ** argv)
sDw.DhopEO(ssrc_o, sr_e, DaggerNo); sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier();
sDw.stat.print(); sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
@ -301,7 +306,7 @@ int main (int argc, char ** argv)
error+= norm2(ssrc_o); error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl; std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
if(error>1.0e-5) { if(error>1.0e-4) {
setCheckerboard(ssrc,ssrc_o); setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e); setCheckerboard(ssrc,ssrc_e);
std::cout<< ssrc << std::endl; std::cout<< ssrc << std::endl;
@ -337,7 +342,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5); assert(norm2(err)<1.0e-4);
LatticeFermion src_e (FrbGrid); LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid); LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid); LatticeFermion r_e (FrbGrid);
@ -363,11 +368,13 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*********************************************************" <<std::endl; std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{ {
Dw.ZeroCounters(); Dw.ZeroCounters();
FGrid->Barrier();
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2; double flops=(1344.0*volume*ncall)/2;
@ -389,14 +396,14 @@ int main (int argc, char ** argv)
err = r_eo-result; err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5); assert(norm2(err)<1.0e-4);
pickCheckerboard(Even,src_e,err); pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err); pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl; std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-5); assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-5); assert(norm2(src_o)<1.0e-4);
Grid_finalize(); Grid_finalize();
} }

View File

@ -69,8 +69,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl; std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl; std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
int Lmax=32; int Lmax=16;
int dmin=0; int dmin=2;
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX")); if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN")); if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
for (int L=8;L<=Lmax;L*=2){ for (int L=8;L<=Lmax;L*=2){

View File

@ -30,8 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_ #define _GRID_FFT_H_
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
#ifdef USE_MKL
#include <fftw/fftw3.h>
#else
#include <fftw3.h> #include <fftw3.h>
#endif #endif
#endif
namespace Grid { namespace Grid {
@ -122,6 +126,7 @@ namespace Grid {
double Flops(void) {return flops;} double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;} double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) : FFT ( GridCartesian * grid ) :
vgrid(grid), vgrid(grid),

View File

@ -93,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) { void Grid_quiesce_nodes(void) {
int me = 0; int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
MPI_Comm_rank(MPI_COMM_WORLD, &me); MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM

View File

@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#else #else
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__
#include <x86intrin.h>
#endif
namespace Grid { namespace Grid {
@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
return tmp; return tmp;
} }
#elif defined __x86_64__ #elif defined __x86_64__
#include <x86intrin.h>
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); return __rdtsc();
// unsigned int dummy; // unsigned int dummy;

View File

@ -191,7 +191,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
<< LinalgTimer.Elapsed(); << LinalgTimer.Elapsed();
std::cout << std::endl; std::cout << std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
if (!CGState.do_repro && ReproTest){ if (!CGState.do_repro && ReproTest){
CGState.do_repro = true; CGState.do_repro = true;

View File

@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ assert(0);} void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;

View File

@ -10,6 +10,7 @@
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
} }
#if defined(AVX512) #if defined(AVX512)
#include <simd/Intel512wilson.h>
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine // If we are AVX512 specialise the single precision routine
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h> #include <simd/Intel512single.h>
static Vector<vComplexF> signs; static Vector<vComplexF> signsF;
int setupSigns(void ){ template<typename vtype>
Vector<vComplexF> bother(2); int setupSigns(Vector<vtype>& signs ){
Vector<vtype> bother(2);
signs = bother; signs = bother;
vrsign(signs[0]); vrsign(signs[0]);
visign(signs[1]); visign(signs[1]);
return 1; return 1;
} }
static int signInit = setupSigns();
static int signInitF = setupSigns(signsF);
#define label(A) ilabel(A) #define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n" #define ilabel(A) ".globl\n" #A ":\n"
@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
#define MAYBEPERM(A,perm) if (perm) { A ; } #define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A #define FX(A) WILSONASM_ ##A
#define COMPLEX_TYPE vComplexF
#define signs signsF
#undef KERNEL_DAG #undef KERNEL_DAG
template<> void template<> void
@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
#undef FX #undef FX
#define FX(A) DWFASM_ ## A #define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B) #define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) //#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) //#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG #undef KERNEL_DAG
@ -113,8 +118,71 @@ template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_TYPE
#undef signs
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#endif ///////////////////////////////////////////////////////////
// If we are AVX512 specialise the double precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512double.h>
static Vector<vComplexD> signsD;
#define signs signsD
static int signInitD = setupSigns(signsD);
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
#define COMPLEX_TYPE vComplexD
#undef KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B)
//#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C)
//#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_TYPE
#undef signs
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#endif //AVX512
#define INSTANTIATE_ASM(A)\ #define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\

View File

@ -5,7 +5,9 @@
const uint64_t plocal =(uint64_t) & in._odata[0]; const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] }; // vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0]; //COMPLEX_TYPE is vComplexF of vComplexD depending
//on the chosen precision
COMPLEX_TYPE *isigns = &signs[0];
MASK_REGS; MASK_REGS;
int nmax=U._grid->oSites(); int nmax=U._grid->oSites();

View File

@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate {
NoSmearing<Gimpl> SmearingPolicy; NoSmearing<Gimpl> SmearingPolicy;
typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy > typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy >
IntegratorType; // change here to change the algorithm IntegratorType; // change here to change the algorithm
IntegratorParameters MDpar(20, 1.0); IntegratorParameters MDpar(40, 1.0);
IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy); IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
// Checkpoint strategy // Checkpoint strategy

View File

@ -382,7 +382,6 @@ namespace Optimization {
// Some Template specialization // Some Template specialization
// Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
#ifndef __INTEL_COMPILER #ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics" #warning "Slow reduction due to incomplete reduce intrinsics"
//Complex float Reduce //Complex float Reduce

View File

@ -9,4 +9,4 @@ endif
include Make.inc include Make.inc
subtests: subtests:
for d in $(SUBDIRS); do make -C $${d} tests; done for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done

View File

@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) { static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
// ImplComplex cmi(0.0,-1.0); // ImplComplex cmi(0.0,-1.0);
ComplexD cmi(0.0,-1.0); Complex cmi(0.0,-1.0);
A[mu] = Ta(U[mu]) * cmi; A[mu] = Ta(U[mu]) * cmi;
} }
} }
@ -52,13 +52,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1); dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
} }
} }
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,RealD & alpha,int maxiter,RealD Omega_tol, RealD Phi_tol) { static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol) {
GridBase *grid = Umu._grid; GridBase *grid = Umu._grid;
RealD org_plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu); Real org_plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu);
RealD org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);
RealD old_trace = org_link_trace; Real old_trace = org_link_trace;
RealD trG; Real trG;
std::vector<GaugeMat> U(Nd,grid); std::vector<GaugeMat> U(Nd,grid);
GaugeMat dmuAmu(grid); GaugeMat dmuAmu(grid);
@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
// Monitor progress and convergence test // Monitor progress and convergence test
// infrequently to minimise cost overhead // infrequently to minimise cost overhead
if ( i %20 == 0 ) { if ( i %20 == 0 ) {
RealD plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu); Real plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu);
RealD link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);
std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl; std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
RealD Phi = 1.0 - old_trace / link_trace ; Real Phi = 1.0 - old_trace / link_trace ;
RealD Omega= 1.0 - trG; Real Omega= 1.0 - trG;
std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl; std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
@ -91,7 +91,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
} }
} }
}; };
static RealD SteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) { static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
GridBase *grid = U[0]._grid; GridBase *grid = U[0]._grid;
std::vector<GaugeMat> A(Nd,grid); std::vector<GaugeMat> A(Nd,grid);
@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
RealD vol = grid->gSites(); Real vol = grid->gSites();
RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
SU<Nc>::GaugeTransform(U,g); SU<Nc>::GaugeTransform(U,g);
return trG; return trG;
} }
static RealD FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) { static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
GridBase *grid = U[0]._grid; GridBase *grid = U[0]._grid;
RealD vol = grid->gSites(); Real vol = grid->gSites();
FFT theFFT((GridCartesian *)grid); FFT theFFT((GridCartesian *)grid);
LatticeComplex Fp(grid); LatticeComplex Fp(grid);
LatticeComplex psq(grid); psq=zero; LatticeComplex psq(grid); psq=zero;
LatticeComplex pmu(grid); LatticeComplex pmu(grid);
LatticeComplex one(grid); one = ComplexD(1.0,0.0); LatticeComplex one(grid); one = Complex(1.0,0.0);
GaugeMat g(grid); GaugeMat g(grid);
GaugeMat dmuAmu_p(grid); GaugeMat dmuAmu_p(grid);
@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
std::vector<int> coor(grid->_ndimension,0); std::vector<int> coor(grid->_ndimension,0);
for(int mu=0;mu<Nd;mu++) { for(int mu=0;mu<Nd;mu++) {
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; Real TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(pmu,mu); LatticeCoordinate(pmu,mu);
pmu = TwoPiL * pmu ; pmu = TwoPiL * pmu ;
psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5);
} }
ComplexD psqMax(16.0); Complex psqMax(16.0);
Fp = psqMax*one/psq; Fp = psqMax*one/psq;
static int once; static int once;
@ -160,20 +160,20 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward); theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
GaugeMat ciadmam(grid); GaugeMat ciadmam(grid);
ComplexD cialpha(0.0,-alpha); Complex cialpha(0.0,-alpha);
ciadmam = dmuAmu*cialpha; ciadmam = dmuAmu*cialpha;
SU<Nc>::taExp(ciadmam,g); SU<Nc>::taExp(ciadmam,g);
RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
SU<Nc>::GaugeTransform(U,g); SU<Nc>::GaugeTransform(U,g);
return trG; return trG;
} }
static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
GridBase *grid = g._grid; GridBase *grid = g._grid;
ComplexD cialpha(0.0,-alpha); Complex cialpha(0.0,-alpha);
GaugeMat ciadmam(grid); GaugeMat ciadmam(grid);
DmuAmu(A,dmuAmu); DmuAmu(A,dmuAmu);
ciadmam = dmuAmu*cialpha; ciadmam = dmuAmu*cialpha;
@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
ComplexField pha(grid); ComplexField pha(grid);
GaugeMat Apha(grid); GaugeMat Apha(grid);
ComplexD ci(0.0,1.0); Complex ci(0.0,1.0);
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; Real TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(pmu,mu); LatticeCoordinate(pmu,mu);
pmu = TwoPiL * pmu ; pmu = TwoPiL * pmu ;
pha = exp(pmu * (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2) pha = exp(pmu * (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2)
@ -213,14 +213,14 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
ComplexField pmu(grid); ComplexField pmu(grid);
ComplexField pha(grid); ComplexField pha(grid);
ComplexD ci(0.0,1.0); Complex ci(0.0,1.0);
// Sign convention for FFTW calls: // Sign convention for FFTW calls:
// A(x)= Sum_p e^ipx A(p) / V // A(x)= Sum_p e^ipx A(p) / V
// A(p)= Sum_p e^-ipx A(x) // A(p)= Sum_p e^-ipx A(x)
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; Real TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(pmu,mu); LatticeCoordinate(pmu,mu);
pmu = TwoPiL * pmu ; pmu = TwoPiL * pmu ;
pha = exp(-pmu * (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2) pha = exp(-pmu * (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2)
@ -241,7 +241,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::vector<int> latt_size = GridDefaultLatt(); std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1}); std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;
@ -261,25 +261,25 @@ int main (int argc, char ** argv)
std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge *" <<std::endl; std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge *" <<std::endl;
std::cout<< "*****************************************************************" <<std::endl; std::cout<< "*****************************************************************" <<std::endl;
LatticeGaugeFieldD Umu(&GRID); LatticeGaugeField Umu(&GRID);
LatticeGaugeFieldD Uorg(&GRID); LatticeGaugeField Uorg(&GRID);
LatticeColourMatrixD g(&GRID); // Gauge xform LatticeColourMatrix g(&GRID); // Gauge xform
SU3::ColdConfiguration(pRNG,Umu); // Unit gauge SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
Uorg=Umu; Uorg=Umu;
SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
RealD plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu); Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl; std::cout << " Initial plaquette "<<plaq << std::endl;
RealD alpha=0.1; Real alpha=0.1;
FourierAcceleratedGaugeFixer<PeriodicGimplD>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu); plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Final plaquette "<<plaq << std::endl; std::cout << " Final plaquette "<<plaq << std::endl;
Uorg = Uorg - Umu; Uorg = Uorg - Umu;

View File

@ -93,10 +93,10 @@ int main (int argc, char ** argv)
C=C-Ctilde; C=C-Ctilde;
std::cout << "diff scalar "<<norm2(C) << std::endl; std::cout << "diff scalar "<<norm2(C) << std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde; std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
SpinMatrixF Sp; SpinMatrixF Sp;
Sp = zero; Sp = Sp+cVol; Sp = zero; Sp = Sp+cVol;

View File

@ -68,7 +68,7 @@ class HmcRunner : public NerscHmcRunner {
TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG); TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
// Set smearing (true/false), default: false // Set smearing (true/false), default: false
Nf2.is_smeared = true; Nf2.is_smeared = false;
// Collect actions // Collect actions
ActionLevel<LatticeGaugeField> Level1(1); ActionLevel<LatticeGaugeField> Level1(1);