Mirror of https://github.com/paboyle/Grid.git
synced 2025-06-19 00:07:05 +01:00
Merge branch 'develop' into feature/gpu-port
47  benchmarks/Benchmark_IO.cc  Normal file
@@ -0,0 +1,47 @@
#include "Benchmark_IO.hpp"

#ifndef BENCH_IO_LMAX
#define BENCH_IO_LMAX 40
#endif

using namespace Grid;

std::string filestem(const int l)
{
  return "iobench_l" + std::to_string(l);
}

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int64_t threads = GridThread::GetThreads();
  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
  MSG << SEP << std::endl;
  MSG << "Benchmark Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
    auto mpi  = GridDefaultMpi();
    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

    std::cout << "-- Local volume " << l << "^4" << std::endl;
    writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
  }

  MSG << "Benchmark Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
    auto mpi  = GridDefaultMpi();
    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

    std::cout << "-- Local volume " << l << "^4" << std::endl;
    readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
  }

  Grid_finalize();

  return EXIT_SUCCESS;
}
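Since the limit is guarded by #ifndef, BENCH_IO_LMAX can be overridden at compile time without editing the source, e.g. by adding -DBENCH_IO_LMAX=24 to the compiler flags (illustrative value, not from this commit); the loops then cover local volumes from 4^4 up to that limit in steps of two.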
107  benchmarks/Benchmark_IO.hpp  Normal file
@@ -0,0 +1,107 @@
#ifndef Benchmark_IO_hpp_
#define Benchmark_IO_hpp_

#include <Grid/Grid.h>

#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="

namespace Grid {

template <typename Field>
using WriterFn = std::function<void(const std::string, Field &)> ;
template <typename Field>
using ReaderFn = std::function<void(Field &, const std::string)>;

template <typename Field>
void limeWrite(const std::string filestem, Field &vec)
{
  emptyUserRecord record;
  ScidacWriter    binWriter(vec.Grid()->IsBoss());

  binWriter.open(filestem + ".bin");
  binWriter.writeScidacFieldRecord(vec, record);
  binWriter.close();
}

template <typename Field>
void limeRead(Field &vec, const std::string filestem)
{
  emptyUserRecord record;
  ScidacReader    binReader;

  binReader.open(filestem + ".bin");
  binReader.readScidacFieldRecord(vec, record);
  binReader.close();
}

inline void makeGrid(std::shared_ptr<GridBase> &gPt,
                     const std::shared_ptr<GridCartesian> &gBasePt,
                     const unsigned int Ls = 1, const bool rb = false)
{
  if (rb)
  {
    if (Ls > 1)
    {
      gPt.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gBasePt.get()));
    }
    else
    {
      gPt.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gBasePt.get()));
    }
  }
  else
  {
    if (Ls > 1)
    {
      gPt.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gBasePt.get()));
    }
    else
    {
      gPt = gBasePt;
    }
  }
}

template <typename Field>
void writeBenchmark(const Coordinate &latt, const std::string filename,
                    const WriterFn<Field> &write,
                    const unsigned int Ls = 1, const bool rb = false)
{
  auto mpi  = GridDefaultMpi();
  auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
  std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  std::shared_ptr<GridBase>      gPt;

  makeGrid(gPt, gBasePt, Ls, rb);

  GridBase        *g = gPt.get();
  GridParallelRNG  rng(g);
  Field            vec(g);

  random(rng, vec);
  write(filename, vec);
}

template <typename Field>
void readBenchmark(const Coordinate &latt, const std::string filename,
                   const ReaderFn<Field> &read,
                   const unsigned int Ls = 1, const bool rb = false)
{
  auto mpi  = GridDefaultMpi();
  auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
  std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  std::shared_ptr<GridBase>      gPt;

  makeGrid(gPt, gBasePt, Ls, rb);

  GridBase *g = gPt.get();
  Field     vec(g);

  read(vec, filename);
}

}

#endif // Benchmark_IO_hpp_
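The WriterFn/ReaderFn aliases above exist so the drivers are not tied to Lime: anything convertible to the matching std::function can be timed. A minimal sketch of plugging a custom writer into writeBenchmark follows; the lambda name and the ".alt.bin" suffix are illustrative, not from this commit, and the body simply reuses the same ScidacWriter calls as limeWrite.

// --- begin illustrative sketch (not part of the commit) ---
// "altWrite" is any callable matching WriterFn<LatticeFermion>; here it mirrors
// limeWrite but writes to a differently named file, e.g. to compare I/O paths.
auto altWrite = [](const std::string stem, LatticeFermion &vec)
{
  emptyUserRecord record;
  ScidacWriter    writer(vec.Grid()->IsBoss());

  writer.open(stem + ".alt.bin");             // alternative target file
  writer.writeScidacFieldRecord(vec, record);
  writer.close();
};

// Same driver as in Benchmark_IO.cc, pointed at the custom writer.
writeBenchmark<LatticeFermion>(GridDefaultLatt(), "iobench_custom", altWrite);
// --- end sketch ---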
79  benchmarks/Benchmark_IO_vs_dir.cc  Normal file
@@ -0,0 +1,79 @@
#include "Benchmark_IO.hpp"

#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="

using namespace Grid;
using namespace QCD;

int main (int argc, char ** argv)
{
  std::vector<std::string> dir;
  unsigned int             Ls;
  bool                     rb;

  if (argc < 4)
  {
    std::cerr << "usage: " << argv[0] << " <Ls> <RB {0|1}> <dir1> [<dir2> ... <dirn>] [Grid options]";
    std::cerr << std::endl;
  }
  Ls = std::stoi(argv[1]);
  rb = (std::string(argv[2]) == "1");
  for (unsigned int i = 3; i < argc; ++i)
  {
    std::string a = argv[i];

    if (a[0] != '-')
    {
      dir.push_back(std::string(argv[i]));
    }
    else
    {
      break;
    }
  }
  Grid_init(&argc,&argv);

  int64_t threads = GridThread::GetThreads();
  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
  MSG << SEP << std::endl;
  MSG << "Benchmark double precision Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
  }

  MSG << SEP << std::endl;
  MSG << "Benchmark double precision Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
  }

  MSG << SEP << std::endl;
  MSG << "Benchmark single precision Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
    writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
  }

  MSG << SEP << std::endl;
  MSG << "Benchmark single precision Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
    readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
  }

  Grid_finalize();

  return EXIT_SUCCESS;
}
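As the usage string indicates, the Ls and RB arguments are followed by the list of directories to test, and any Grid options must come last, e.g. Benchmark_IO_vs_dir 12 0 /scratch/a /scratch/b --mpi 1.1.1.2 (illustrative values); every directory then receives the full double- and single-precision write/read sweep.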
@@ -446,7 +446,7 @@ int main (int argc, char ** argv)
  }



#ifdef GRID_OMP
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -502,9 +502,9 @@ int main (int argc, char ** argv)
    int comm_proc = mpi_layout[mu]-1;
    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
  }

  int tid = omp_get_thread_num();
  tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-                                     (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+                                     (void *)&rbuf[dir][0], recv_from_rank, bytes,tid);

  thread_critical { dbytes+=tbytes; }
  }
@@ -531,7 +531,7 @@ int main (int argc, char ** argv)

  }
  }

#endif
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -48,7 +48,6 @@ int main (int argc, char ** argv)


  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  Coordinate latt4 = GridDefaultLatt();
  int Ls=8;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
    std::stringstream ss(argv[i+1]); ss >> Ls;
  }

  GridLogLayout();

  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);


  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
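Side note, not part of the diff: with the per-site count defined just above, Nc = 3 gives 8*3*(7+16*3) = 24*55 = 1320 flops per site, the usual Wilson dslash figure; this Nc-dependent number replaces the hard-coded 1344 in the flops formulas below, and matches the 2*1344 -> 2*1320 change in a later file.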
@@ -73,9 +76,9 @@ int main (int argc, char ** argv)
  std::vector<int> seeds5({5,6,7,8});

  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
-  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
-  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

  LatticeFermion src (FGrid); random(RNG5,src);
@@ -193,7 +196,7 @@ int main (int argc, char ** argv)
  FGrid->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;

  std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -231,7 +234,7 @@ int main (int argc, char ** argv)
  FGrid->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;

  std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -283,7 +286,7 @@ int main (int argc, char ** argv)
  double t1=usecond();
  FGrid->Barrier();
  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;

  std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -359,7 +362,7 @@ int main (int argc, char ** argv)
  // sDw.stat.print();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=(1344.0*volume*ncall)/2;
+  double flops=(single_site_flops*volume*ncall)/2.0;

  std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
  std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
@@ -491,7 +494,7 @@ int main (int argc, char ** argv)
  FGrid->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=(1344.0*volume*ncall)/2;
+  double flops=(single_site_flops*volume*ncall)/2.0;

  std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
  std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
@@ -50,6 +50,7 @@ int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);


  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
@@ -200,7 +202,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
#endif
  if ( ! report ) {
    double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
  }

@@ -232,7 +234,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )

  if(!report){
    double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
    std::cout<< flops/(t1-t0);
  }
}
@@ -241,6 +243,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
#define CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);

  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -333,7 +336,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )

  if ( !report){
    double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }

@@ -375,7 +378,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )

  if ( ! report ) {
    double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
    std::cout<<"\t"<< flops/(t1-t0);
  }
}
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
  FGrid->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;

  std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -134,7 +134,7 @@ int main (int argc, char ** argv)
  FGrid->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;

  std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -174,7 +174,7 @@ int main (int argc, char ** argv)
  FGrid_d->Barrier();

  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;

  std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
  // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
812  benchmarks/Benchmark_meson_field.cc  Normal file
@@ -0,0 +1,812 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./benchmarks/Benchmark_wilson.cc

Copyright (C) 2018

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;


#include "Grid/util/Profiling.h"

template<class vobj>
void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
                                 const std::vector<Lattice<vobj> > &lhs,
                                 const std::vector<Lattice<vobj> > &rhs,
                                 int orthogdim)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Lblock = lhs.size();
  int Rblock = rhs.size();

  GridBase *grid = lhs[0].Grid();

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  int Nt = grid->GlobalDimensions()[orthogdim];

  assert(mat.size()==Lblock*Rblock);
  for(int t=0;t<mat.size();t++){
    assert(mat[t].size()==Nt);
  }

  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  // will locally sum vectors first
  // sum across these down to scalars
  // splitting the SIMD
  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd*Lblock*Rblock);
  parallel_for (int r = 0; r < rd * Lblock * Rblock; r++){
    lvSum[r] = Zero();
  }

  std::vector<scalar_type > lsSum(ld*Lblock*Rblock,scalar_type(0.0));

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        for(int i=0;i<Lblock;i++){
          auto lhs_v = lhs[i].View();
          auto left = conjugate(lhs_v[ss]);
          for(int j=0;j<Rblock;j++){
            int idx = i+Lblock*j+Lblock*Rblock*r;
            auto rhs_v = rhs[j].View();
            auto right = rhs_v[ss];
            vector_type vv = left()(0)(0) * right()(0)(0)
              + left()(0)(1) * right()(0)(1)
              + left()(0)(2) * right()(0)(2)
              + left()(1)(0) * right()(1)(0)
              + left()(1)(1) * right()(1)(1)
              + left()(1)(2) * right()(1)(2)
              + left()(2)(0) * right()(2)(0)
              + left()(2)(1) * right()(2)(1)
              + left()(2)(2) * right()(2)(2)
              + left()(3)(0) * right()(3)(0)
              + left()(3)(1) * right()(3)(1)
              + left()(3)(2) * right()(3)(2);
            lvSum[idx]=lvSum[idx]+vv;
          }
        }
      }
    }
  }

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
  parallel_for(int rt=0;rt<rd;rt++){

    Coordinate icoor(Nd);

    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){

        iScalar<vector_type> temp;
        ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);

        temp._internal = lvSum[i+Lblock*j+Lblock*Rblock*rt];

        extract(temp,extracted);

        for(int idx=0;idx<Nsimd;idx++){

          grid->iCoorFromIindex(icoor,idx);

          int ldx =rt+icoor[orthogdim]*rd;

          int ij_dx = i+Lblock*j+Lblock*Rblock*ldx;
          lsSum[ij_dx]=lsSum[ij_dx]+extracted[idx]._internal;

        }
      }}
  }

  std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
  for(int t=0;t<fd;t++)
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        if (pt == grid->_processor_coor[orthogdim]){
          int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
          mat[i+j*Lblock][t] = lsSum[ij_dx];
        }
        else{
          mat[i+j*Lblock][t] = scalar_type(0.0);
        }
      }}
  }
  std::cout << GridLogMessage << " Done "<<std::endl;
  // defer sum over nodes.
  return;
}
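In compact form (notation chosen here for illustration, not taken from the source), the quantity this routine accumulates for every (i, j) pair and every slice t of the orthogonal direction, before the deferred sum over nodes, is M_ij(t) = sum_x w_i^dag(x,t) v_j(x,t), with w_i drawn from lhs, v_j from rhs, and the contraction running over spin and colour; the Gamma and momentum variants below insert a gamma matrix and a phase field into the same summand.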
template<class vobj>
void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
                                      const std::vector<Lattice<vobj> > &lhs,
                                      const std::vector<Lattice<vobj> > &rhs,
                                      int orthogdim,
                                      std::vector<Gamma::Algebra> gammas)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Lblock = lhs.size();
  int Rblock = rhs.size();

  GridBase *grid = lhs[0].Grid();

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  int Nt = grid->GlobalDimensions()[orthogdim];
  int Ngamma = gammas.size();

  assert(mat.size()==Lblock*Rblock*Ngamma);
  for(int t=0;t<mat.size();t++){
    assert(mat[t].size()==Nt);
  }

  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  // will locally sum vectors first
  // sum across these down to scalars
  // splitting the SIMD
  int MFrvol = rd*Lblock*Rblock*Ngamma;
  int MFlvol = ld*Lblock*Rblock*Ngamma;

  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(MFrvol);
  parallel_for (int r = 0; r < MFrvol; r++){
    lvSum[r] = Zero();
  }

  std::vector<scalar_type > lsSum(MFlvol);
  parallel_for (int r = 0; r < MFlvol; r++){
    lsSum[r]=scalar_type(0.0);
  }

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        for(int i=0;i<Lblock;i++){
          auto lhs_v=lhs[i].View();
          auto left = conjugate(lhs_v[ss]);
          for(int j=0;j<Rblock;j++){
            for(int mu=0;mu<Ngamma;mu++){

              auto rhs_v = rhs[j].View();
              auto right = Gamma(gammas[mu])*rhs_v[ss];

              vector_type vv = left()(0)(0) * right()(0)(0)
                + left()(0)(1) * right()(0)(1)
                + left()(0)(2) * right()(0)(2)
                + left()(1)(0) * right()(1)(0)
                + left()(1)(1) * right()(1)(1)
                + left()(1)(2) * right()(1)(2)
                + left()(2)(0) * right()(2)(0)
                + left()(2)(1) * right()(2)(1)
                + left()(2)(2) * right()(2)(2)
                + left()(3)(0) * right()(3)(0)
                + left()(3)(1) * right()(3)(1)
                + left()(3)(2) * right()(3)(2);

              int idx = mu+i*Ngamma+Lblock*Ngamma*j+Ngamma*Lblock*Rblock*r;

              lvSum[idx]=lvSum[idx]+vv;
            }
          }
        }
      }
    }
  }

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
  parallel_for(int rt=0;rt<rd;rt++){

    iScalar<vector_type> temp;
    Coordinate icoor(Nd);
    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);

    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        for(int mu=0;mu<Ngamma;mu++){

          int ij_rdx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock*rt;
          temp._internal = lvSum[ij_rdx];

          extract(temp,extracted);

          for(int idx=0;idx<Nsimd;idx++){

            grid->iCoorFromIindex(icoor,idx);

            int ldx =rt+icoor[orthogdim]*rd;

            int ij_ldx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock*ldx;
            lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;

          }
        }}}
  }

  std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
  for(int t=0;t<fd;t++)
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        for(int mu=0;mu<Ngamma;mu++){
          if (pt == grid->_processor_coor[orthogdim]){
            int ij_dx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock* lt;
            mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = lsSum[ij_dx];
          }
          else{
            mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = scalar_type(0.0);
          }
        }}}
  }
  std::cout << GridLogMessage << " Done "<<std::endl;
  // defer sum over nodes.
  return;
}
template<class vobj>
void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat,
                                       const std::vector<Lattice<vobj> > &lhs,
                                       const std::vector<Lattice<vobj> > &rhs,
                                       int orthogdim,
                                       std::vector<Gamma::Algebra> gammas)
{

  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef iSpinMatrix<vector_type> SpinMatrix_v;
  typedef iSpinMatrix<scalar_type> SpinMatrix_s;

  int Lblock = lhs.size();
  int Rblock = rhs.size();

  GridBase *grid = lhs[0].Grid();

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  int Nt = grid->GlobalDimensions()[orthogdim];
  int Ngamma = gammas.size();

  assert(mat.size()==Lblock*Rblock*Ngamma);
  for(int t=0;t<mat.size();t++){
    assert(mat[t].size()==Nt);
  }

  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  // will locally sum vectors first
  // sum across these down to scalars
  // splitting the SIMD
  int MFrvol = rd*Lblock*Rblock;
  int MFlvol = ld*Lblock*Rblock;

  Vector<SpinMatrix_v > lvSum(MFrvol);
  parallel_for (int r = 0; r < MFrvol; r++){
    lvSum[r] = Zero();
  }

  Vector<SpinMatrix_s > lsSum(MFlvol);
  parallel_for (int r = 0; r < MFlvol; r++){
    lsSum[r]=scalar_type(0.0);
  }

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        for(int i=0;i<Lblock;i++){

          auto lhs_v=lhs[i].View();
          auto left = conjugate(lhs_v[ss]);
          for(int j=0;j<Rblock;j++){

            SpinMatrix_v vv;
            auto rhs_v = rhs[j].View();
            auto right = rhs_v[ss];
            for(int s1=0;s1<Ns;s1++){
              for(int s2=0;s2<Ns;s2++){
                vv()(s2,s1)() = left()(s1)(0) * right()(s2)(0)
                  + left()(s1)(1) * right()(s2)(1)
                  + left()(s1)(2) * right()(s2)(2);
              }}

            int idx = i+Lblock*j+Lblock*Rblock*r;

            lvSum[idx]=lvSum[idx]+vv;

          }
        }
      }
    }
  }

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
  parallel_for(int rt=0;rt<rd;rt++){

    Coordinate icoor(Nd);
    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);

    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){

        int ij_rdx = i+Lblock*j+Lblock*Rblock*rt;

        extract(lvSum[ij_rdx],extracted);

        for(int idx=0;idx<Nsimd;idx++){

          grid->iCoorFromIindex(icoor,idx);

          int ldx = rt+icoor[orthogdim]*rd;

          int ij_ldx = i+Lblock*j+Lblock*Rblock*ldx;

          lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];

        }
      }}
  }

  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
  parallel_for(int t=0;t<fd;t++)
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        if (pt == grid->_processor_coor[orthogdim]){
          int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
          for(int mu=0;mu<Ngamma;mu++){
            mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
          }
        }
        else{
          for(int mu=0;mu<Ngamma;mu++){
            mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = scalar_type(0.0);
          }
        }
      }}
  }
  std::cout << GridLogMessage << " Done "<<std::endl;
  // defer sum over nodes.
  return;
}
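The Gamma1 variant above factorises the work differently from sliceInnerProductMesonFieldGamma: the volume loop accumulates only the colour-contracted spin matrix built from left and right, with no gamma applied, so the innermost cost is independent of the number of gammas; all 16 insertions are recovered only at the end, per time slice, as trace(lsSum * Gamma(gammas[mu])).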
template<class vobj>
void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &mat,
                                         const std::vector<Lattice<vobj> > &lhs,
                                         const std::vector<Lattice<vobj> > &rhs,
                                         int orthogdim,
                                         std::vector<Gamma::Algebra> gammas,
                                         const std::vector<LatticeComplex > &mom)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef iSpinMatrix<vector_type> SpinMatrix_v;
  typedef iSpinMatrix<scalar_type> SpinMatrix_s;

  int Lblock = lhs.size();
  int Rblock = rhs.size();

  GridBase *grid = lhs[0].Grid();

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
  int Nt = grid->GlobalDimensions()[orthogdim];
  int Ngamma = gammas.size();
  int Nmom = mom.size();

  assert(mat.size()==Lblock*Rblock*Ngamma*Nmom);
  for(int t=0;t<mat.size();t++){
    assert(mat[t].size()==Nt);
  }

  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  // will locally sum vectors first
  // sum across these down to scalars
  // splitting the SIMD
  int MFrvol = rd*Lblock*Rblock*Nmom;
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<SpinMatrix_v > lvSum(MFrvol);
  parallel_for (int r = 0; r < MFrvol; r++){
    lvSum[r] = Zero();
  }

  Vector<SpinMatrix_s > lsSum(MFlvol);
  parallel_for (int r = 0; r < MFlvol; r++){
    lsSum[r]=scalar_type(0.0);
  }

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
  parallel_for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;


        for(int i=0;i<Lblock;i++){

          auto lhs_v = lhs[i].View();
          auto left = conjugate(lhs_v[ss]);
          for(int j=0;j<Rblock;j++){

            SpinMatrix_v vv;
            auto rhs_v = rhs[j].View();
            auto right = rhs_v[ss];
            for(int s1=0;s1<Ns;s1++){
              for(int s2=0;s2<Ns;s2++){
                vv()(s1,s2)() = left()(s1)(0) * right()(s2)(0)
                  + left()(s1)(1) * right()(s2)(1)
                  + left()(s1)(2) * right()(s2)(2);
              }}

            // After getting the sitewise product do the mom phase loop
            int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
            // Trigger unroll
            for ( int m=0;m<Nmom;m++){
              int idx = m+base;
              auto mom_v = mom[m].View();
              auto phase = mom_v[ss];
              mac(&lvSum[idx],&vv,&phase);
            }

          }
        }
      }
    }
  }

  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
  // Sum across simd lanes in the plane, breaking out orthog dir.
  parallel_for(int rt=0;rt<rd;rt++){

    Coordinate icoor(Nd);
    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);


    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        for(int m=0;m<Nmom;m++){

          int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;

          extract(lvSum[ij_rdx],extracted);

          for(int idx=0;idx<Nsimd;idx++){

            grid->iCoorFromIindex(icoor,idx);

            int ldx = rt+icoor[orthogdim]*rd;

            int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;

            lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];

          }
        }}}
  }

  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
  parallel_for(int t=0;t<fd;t++)
  {
    int pt = t / ld; // processor plane
    int lt = t % ld;
    for(int i=0;i<Lblock;i++){
      for(int j=0;j<Rblock;j++){
        if (pt == grid->_processor_coor[orthogdim]){
          for(int m=0;m<Nmom;m++){
            int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
            for(int mu=0;mu<Ngamma;mu++){
              mat[ mu
                   +m*Ngamma
                   +i*Nmom*Ngamma
                   +j*Nmom*Ngamma*Lblock][t] = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
            }
          }
        }
        else{
          for(int mu=0;mu<Ngamma;mu++){
            for(int m=0;m<Nmom;m++){
              mat[mu+m*Ngamma+i*Nmom*Ngamma+j*Nmom*Lblock*Ngamma][t] = scalar_type(0.0);
            }}
        }
      }}
  }
  std::cout << GridLogMessage << " Done "<<std::endl;
  // defer sum over nodes.
  return;
}
/*
template void sliceInnerProductMesonField<SpinColourVector>(std::vector< std::vector<ComplexD> > &mat,
                                                            const std::vector<Lattice<SpinColourVector> > &lhs,
                                                            const std::vector<Lattice<SpinColourVector> > &rhs,
                                                            int orthogdim) ;
*/

std::vector<Gamma::Algebra> Gmu4 ( {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT });

std::vector<Gamma::Algebra> Gmu16 ( {
    Gamma::Algebra::Gamma5,
    Gamma::Algebra::GammaT,
    Gamma::Algebra::GammaTGamma5,
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaXGamma5,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaYGamma5,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaZGamma5,
    Gamma::Algebra::Identity,
    Gamma::Algebra::SigmaXT,
    Gamma::Algebra::SigmaXY,
    Gamma::Algebra::SigmaXZ,
    Gamma::Algebra::SigmaYT,
    Gamma::Algebra::SigmaYZ,
    Gamma::Algebra::SigmaZT
});

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  auto latt_size   = GridDefaultLatt();
  auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  auto mpi_layout  = GridDefaultMpi();
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);

  const int Nmom=7;
  int nt = latt_size[Tp];
  uint64_t vol = 1;
  for(int d=0;d<Nd;d++){
    vol = vol*latt_size[d];
  }

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);


  int Nm = atoi(argv[1]); // number of all modes (high + low)

  std::vector<LatticeFermion> v(Nm,&Grid);
  std::vector<LatticeFermion> w(Nm,&Grid);
  std::vector<LatticeFermion> gammaV(Nm,&Grid);
  std::vector<LatticeComplex> phases(Nmom,&Grid);

  for(int i=0;i<Nm;i++) {
    random(pRNG,v[i]);
    random(pRNG,w[i]);
  }

  for(int i=0;i<Nmom;i++) {
    phases[i] = Complex(1.0);
  }

  double flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm;
  double byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm;

  std::vector<ComplexD> ip(nt);
  std::vector<std::vector<ComplexD> > MesonFields   (Nm*Nm);
  std::vector<std::vector<ComplexD> > MesonFields4  (Nm*Nm*4);
  std::vector<std::vector<ComplexD> > MesonFields16 (Nm*Nm*16);
  std::vector<std::vector<ComplexD> > MesonFields161(Nm*Nm*16);
  std::vector<std::vector<ComplexD> > MesonFields16mom (Nm*Nm*16*Nmom);
  std::vector<std::vector<ComplexD> > MesonFieldsRef(Nm*Nm);

  for(int i=0;i<MesonFields.size();i++ )   MesonFields   [i].resize(nt);
  for(int i=0;i<MesonFieldsRef.size();i++) MesonFieldsRef[i].resize(nt);
  for(int i=0;i<MesonFields4.size();i++ )  MesonFields4  [i].resize(nt);
  for(int i=0;i<MesonFields16.size();i++ ) MesonFields16 [i].resize(nt);
  for(int i=0;i<MesonFields161.size();i++ ) MesonFields161[i].resize(nt);

  for(int i=0;i<MesonFields16mom.size();i++ ) MesonFields16mom [i].resize(nt);

  GridLogMessage.TimingMode(1);

  std::cout<<GridLogMessage << "Running loop with sliceInnerProductVector"<<std::endl;
  double t0 = usecond();
  for(int i=0;i<Nm;i++) {
    for(int j=0;j<Nm;j++) {
      sliceInnerProductVector(ip, w[i],v[j],Tp);
      for(int t=0;t<nt;t++){
        MesonFieldsRef[i+j*Nm][t] = ip[t];
      }
    }}
  double t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;

  std::cout<<GridLogMessage << "Running loop with new code for Nt="<<nt<<std::endl;
  t0 = usecond();
  sliceInnerProductMesonField(MesonFields,w,v,Tp);
  t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;


  std::cout<<GridLogMessage << "Running loop with Four gammas code for Nt="<<nt<<std::endl;
  flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm*4;
  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 4;
  t0 = usecond();
  sliceInnerProductMesonFieldGamma(MesonFields4,w,v,Tp,Gmu4);
  t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;

  std::cout<<GridLogMessage << "Running loop with Sixteen gammas code for Nt="<<nt<<std::endl;
  flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm*16;
  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 16;
  t0 = usecond();
  sliceInnerProductMesonFieldGamma(MesonFields16,w,v,Tp,Gmu16);
  t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;


  std::cout<<GridLogMessage << "Running loop with Sixteen gammas code1 for Nt="<<nt<<std::endl;
  flops = vol * ( 2 * 8.0 + 6.0) * Nm*Nm*16;
  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 16;
  t0 = usecond();
  sliceInnerProductMesonFieldGamma1(MesonFields161, w, v, Tp, Gmu16);
  t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;

  std::cout<<GridLogMessage << "Running loop with Sixteen gammas "<<Nmom<<" momenta "<<std::endl;
  flops = vol * ( 2 * 8.0 + 6.0 + 8.0*Nmom) * Nm*Nm*16;
  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
        + vol * ( 2.0 * sizeof(Complex) *Nmom ) * Nm*Nm* 16;
  t0 = usecond();
  sliceInnerProductMesonFieldGammaMom(MesonFields16mom,w,v,Tp,Gmu16,phases);
  t1 = usecond();
  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;



  RealD err = 0;
  RealD err2 = 0;
  ComplexD diff;
  ComplexD diff2;

  for(int i=0;i<Nm;i++) {
    for(int j=0;j<Nm;j++) {
      for(int t=0;t<nt;t++){
        diff = MesonFields[i+Nm*j][t] - MesonFieldsRef[i+Nm*j][t];
        err += real(diff*conj(diff));
      }
    }}
  std::cout<<GridLogMessage << "Norm error "<< err <<std::endl;

  err = err*0.;
  diff = diff*0.;

  for (int mu = 0; mu < 16; mu++){
    for (int k = 0; k < gammaV.size(); k++){
      gammaV[k] = Gamma(Gmu16[mu]) * v[k];
    }
    for (int i = 0; i < Nm; i++){
      for (int j = 0; j < Nm; j++){
        sliceInnerProductVector(ip, w[i], gammaV[j], Tp);
        for (int t = 0; t < nt; t++){
          MesonFields[i + j * Nm][t] = ip[t];
          diff  = MesonFields16[mu+i*16+Nm*16*j][t] - MesonFields161[mu+i*16+Nm*16*j][t];
          diff2 = MesonFields[i+j*Nm][t] - MesonFields161[mu+i*16+Nm*16*j][t];
          err  += real(diff*conj(diff));
          err2 += real(diff2*conj(diff2));
        }
      }
    }
  }
  std::cout << GridLogMessage << "Norm error 16 gamma1/16 gamma naive " << err << std::endl;
  std::cout << GridLogMessage << "Norm error 16 gamma1/sliceInnerProduct " << err2 << std::endl;

  Grid_finalize();
}
@@ -124,6 +124,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;


  for(int lat=LMIN;lat<=LMAX;lat+=LADD){

    Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@@ -187,5 +188,82 @@ int main (int argc, char ** argv)

  }

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=LMIN;lat<=LMAX;lat+=LADD){

    std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

    LatticeColourMatrix z(&Grid); random(pRNG,z);
    LatticeColourMatrix x(&Grid); random(pRNG,x);
    LatticeColourMatrix y(&Grid); random(pRNG,y);

    for(int mu=0;mu<4;mu++){
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
        z = PeriodicBC::CovShiftForward(x,mu,y);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000.0;


      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
  }
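For orientation, my arithmetic rather than the source's: the formulas above count 3*Nc*Nc complex numbers of traffic and Nc*Nc*(6+8+8) floating-point operations per site and iteration, so for Nc = 3 and a 16-byte double-precision Complex (an assumption about the build) that is 3*9*16 = 432 bytes and 9*22 = 198 flops per site.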
#if 1
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
    std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

    LatticeColourMatrix z(&Grid); random(pRNG,z);
    LatticeColourMatrix x(&Grid); random(pRNG,x);
    LatticeColourMatrix y(&Grid); random(pRNG,y);
    LatticeColourMatrix tmp(&Grid);

    for(int mu=0;mu<4;mu++){
      double tshift=0;
      double tmult =0;

      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
        tshift-=usecond();
        tmp = Cshift(y,mu,-1);
        tshift+=usecond();
        tmult-=usecond();
        z = x*tmp;
        tmult+=usecond();
      }
      double stop=usecond();
      double time = (stop-start)/Nloop;
      tshift = tshift/Nloop;
      tmult  = tmult /Nloop;

      double bytes=3*vol*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*vol;
      std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
      time = time * 1000; // convert to NS for GB/s
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    }
  }
#endif
  Grid_finalize();
}
@@ -4,7 +4,7 @@

  Source file: ./benchmarks/Benchmark_wilson.cc

-  Copyright (C) 2015
+  Copyright (C) 2018

  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
  Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -32,6 +32,9 @@ using namespace std;
using namespace Grid;
 ;


#include "Grid/util/Profiling.h"

template<class d>
struct scal {
  d internal;
@@ -44,21 +47,40 @@ struct scal {
    Gamma::Algebra::GammaT
  };

bool overlapComms = false;
bool perfProfiling = false;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }
  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
    perfProfiling = true;
  }

  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);


  auto latt_size   = GridDefaultLatt();
  auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  auto mpi_layout  = GridDefaultMpi();

  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBGrid(&Grid);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  GridLogLayout();

  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
  std::cout<<GridLogMessage << "Grid number of colours : "<< Nc <<std::endl;
  std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;


  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG pRNG(&Grid);
@@ -135,9 +157,25 @@ int main (int argc, char ** argv)
    Dw.Dhop(src,result,0);
  }
  double t1=usecond();
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;

  if (perfProfiling){
    std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;

    System::profile("kernel", [&]() {
      for(int i=0;i<ncall;i++){
        Dw.Dhop(src,result,0);
      }
    });

    std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
    std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;

  }

  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
  std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -58,6 +58,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Number of colours "<< Nc <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -65,13 +66,15 @@ int main (int argc, char ** argv)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
  std::cout << GridLogMessage << "* OpenMP threads : "<< GridThread::GetThreads() <<std::endl;
  std::cout << GridLogMessage << "* MPI tasks : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;

  int Lmax = 32;
  int dmin = 0;
@@ -93,12 +96,19 @@ int main (int argc, char ** argv)
  GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=Zero();
  LatticeFermion src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
  LatticeFermion result(&Grid); result=Zero();
  LatticeFermion result_e(&RBGrid); result_e=Zero();

  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());

  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);

  // Full operator
  bench_wilson(src,result,Dw,volume,DaggerNo);
  bench_wilson(src,result,Dw,volume,DaggerYes);
  std::cout << "\t";
  // EO
  bench_wilson(src,result,Dw,volume,DaggerNo);
  bench_wilson(src,result,Dw,volume,DaggerYes);
  std::cout << std::endl;
@@ -117,9 +127,26 @@ void bench_wilson (
  int const dag )
{
  int ncall = 1000;
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
  double t0 = usecond();
  for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
  double t1 = usecond();
-  double flops = 1344 * volume * ncall;
+  double flops = single_site_flops * volume * ncall;
  std::cout << flops/(t1-t0) << "\t\t";
}

void bench_wilson_eo (
  LatticeFermion & src,
  LatticeFermion & result,
  WilsonFermionR & Dw,
  double const volume,
  int const dag )
{
  int ncall = 1000;
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
  double t0 = usecond();
  for(int i=0; i<ncall; i++) { Dw.DhopEO(src,result,dag); }
  double t1 = usecond();
  double flops = (single_site_flops * volume * ncall)/2.0;
  std::cout << flops/(t1-t0) << "\t\t";
}