Mirror of https://github.com/paboyle/Grid.git
Benchmarks modified for new GPU constructs

This commit is contained in:
parent 0184719216
commit 0561c2edeb
@@ -5,7 +5,6 @@
 "============================================================================="
 using namespace Grid;
-using namespace QCD;
 
 int main (int argc, char ** argv)
 {
 
@@ -153,8 +153,8 @@ public:
 dbytes=0;
 ncomm=0;
 
-thread_loop( (int dir=0;dir<8;dir++),{
+thread_for(dir,8,{
 
 double tbytes;
 int mu =dir % 4;
 
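The host-side change is mechanical throughout this commit: thread_loop( (int dir=0;dir<8;dir++),{ ... }) becomes thread_for(dir,8,{ ... }), i.e. the call site passes the induction variable name and the trip count instead of a full for-statement. As an illustration only, a thread_for-style wrapper over OpenMP could look like the hypothetical my_thread_for macro below; this is a minimal sketch, not Grid's actual definition.

// Illustration only: a hypothetical thread_for-style macro built on OpenMP.
// Grid's real thread_for lives in its threading headers and may differ.
#include <cstdint>
#include <cstdio>
#include <vector>

#ifdef _OPENMP
#define my_thread_for(i, num, ...) \
  _Pragma("omp parallel for")      \
  for (uint64_t i = 0; i < (uint64_t)(num); i++) { __VA_ARGS__ }
#else
#define my_thread_for(i, num, ...) \
  for (uint64_t i = 0; i < (uint64_t)(num); i++) { __VA_ARGS__ }
#endif

int main() {
  std::vector<double> dbytes_per_dir(8, 0.0);
  // Same shape as the benchmark's thread_for(dir,8,{ ... });
  my_thread_for(dir, 8, {
    int mu = dir % 4;                    // direction decomposition as in the diff
    dbytes_per_dir[dir] = 100.0 * mu;    // stand-in for the per-direction byte count
  });
  for (int d = 0; d < 8; d++) printf("dir %d : %f\n", d, dbytes_per_dir[d]);
  return 0;
}

Keeping the loop body in a macro argument rather than an explicit for-statement is what lets the same call site be retargeted at different threading back ends without touching the benchmark code.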
@@ -485,7 +485,7 @@ int main (int argc, char ** argv)
 dbytes=0;
 ncomm=0;
 
-thread_loop( (int dir=0;dir<8;dir++),{
+thread_for(dir,8,{
 
 double tbytes;
 int mu =dir % 4;
@@ -78,7 +78,7 @@ int main (int argc, char ** argv)
 }
 
 double start=usecond();
-thread_loop( (int t=0;t<threads;t++),{
+thread_for(t,threads,{
 auto x_t = x[t].View();
 sum[t] = x_t[0];
 for(int i=0;i<Nloop;i++){
@@ -30,8 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 using namespace std;
 using namespace Grid;
-using namespace Grid::QCD;
-
 
 #include "Grid/util/Profiling.h"
 
@@ -67,7 +65,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
 // sum across these down to scalars
 // splitting the SIMD
 std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd*Lblock*Rblock);
-thread_loop( (int r = 0; r < rd * Lblock * Rblock; r++),{
+thread_for(r, rd * Lblock * Rblock,{
 lvSum[r] = Zero();
 });
 
@@ -79,7 +77,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
 
 std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
 // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-thread_loop((int r=0;r<rd;r++),{
+thread_for(r,rd,{
 
 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
 
@@ -114,7 +112,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
 
 std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
 // Sum across simd lanes in the plane, breaking out orthog dir.
-thread_loop((int rt=0;rt<rd;rt++),{
+thread_for(rt,rd,{
 
 Coordinate icoor(Nd);
 
@@ -199,12 +197,12 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
 int MFlvol = ld*Lblock*Rblock*Ngamma;
 
 std::vector<vector_type,alignedAllocator<vector_type> > lvSum(MFrvol);
-thread_loop( (int r = 0; r < MFrvol; r++),{
+thread_for(r,MFrvol,{
 lvSum[r] = Zero();
 });
 
 std::vector<scalar_type > lsSum(MFlvol);
-thread_loop( (int r = 0; r < MFlvol; r++),{
+thread_for( r,MFlvol,{
 lsSum[r]=scalar_type(0.0);
 });
 
@@ -215,7 +213,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
 std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
 
 // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-thread_loop((int r=0;r<rd;r++),{
+thread_for(r,rd,{
 
 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
 
@@ -256,7 +254,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
 
 std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
 // Sum across simd lanes in the plane, breaking out orthog dir.
-thread_loop((int rt=0;rt<rd;rt++),{
+thread_for(rt,rd,{
 
 iScalar<vector_type> temp;
 Coordinate icoor(Nd);
@@ -347,12 +345,12 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
 int MFlvol = ld*Lblock*Rblock;
 
 Vector<SpinMatrix_v > lvSum(MFrvol);
-thread_loop( (int r = 0; r < MFrvol; r++),{
+thread_for(r,MFrvol,{
 lvSum[r] = Zero();
 });
 
 Vector<SpinMatrix_s > lsSum(MFlvol);
-thread_loop( (int r = 0; r < MFlvol; r++),{
+thread_for(r,MFlvol,{
 lsSum[r]=scalar_type(0.0);
 });
 
@@ -363,7 +361,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
 std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
 
 // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-thread_loop((int r=0;r<rd;r++),{
+thread_for(r,rd,{
 
 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
 
@@ -398,7 +396,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
 
 std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
 // Sum across simd lanes in the plane, breaking out orthog dir.
-thread_loop((int rt=0;rt<rd;rt++),{
+thread_for(rt,rd,{
 
 Coordinate icoor(Nd);
 ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
@@ -425,7 +423,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
 });
 
 std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-thread_loop((int t=0;t<fd;t++)
+thread_for(t,fd,
 {
 int pt = t / ld; // processor plane
 int lt = t % ld;
@@ -490,13 +488,13 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
 int MFlvol = ld*Lblock*Rblock*Nmom;
 
 Vector<SpinMatrix_v > lvSum(MFrvol);
-thread_loop( (int r = 0; r < MFrvol; r++),
+thread_for(r,MFrvol,
 {
 lvSum[r] = Zero();
 });
 
 Vector<SpinMatrix_s > lsSum(MFlvol);
-thread_loop( (int r = 0; r < MFlvol; r++),
+thread_for(r,MFlvol,
 {
 lsSum[r]=scalar_type(0.0);
 });
@@ -508,7 +506,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
 std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
 
 // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-thread_loop((int r=0;r<rd;r++),
+thread_for(r,rd,
 {
 
 int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -552,7 +550,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
 
 std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
 // Sum across simd lanes in the plane, breaking out orthog dir.
-thread_loop((int rt=0;rt<rd;rt++),
+thread_for(rt,rd,
 {
 
 Coordinate icoor(Nd);
@@ -582,7 +580,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
 });
 
 std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-thread_loop((int t=0;t<fd;t++),
+thread_for(t,fd,
 {
 int pt = t / ld; // processor plane
 int lt = t % ld;
@@ -71,26 +71,21 @@ int main (int argc, char ** argv)
 auto z_v = z.View();
 const uint64_t Nsite = x_v.size();
 const uint64_t nsimd = vComplex::Nsimd();
-const uint64_t NN = Nsite*nsimd;
 for(int64_t i=0;i<Nwarm;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
 auto zz = xx*yy;
-insertLane(lane,z_v[ss],zz);
+coalescedWrite(z_v[ss],zz);
 });
 }
 double start=usecond();
 for(int64_t i=0;i<Nloop;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
 auto zz = xx*yy;
-insertLane(lane,z_v[ss],zz);
+coalescedWrite(z_v[ss],zz);
 });
 }
 double stop=usecond();
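The accelerator loops change more than their name. The old accelerator_loopN iterated over Nsite*nsimd scalar work items and recovered the (site, lane) pair by hand with modulo and division before calling extractLane/insertLane; the new accelerator_for takes the site count and SIMD width as separate arguments and hides lane handling behind coalescedRead/coalescedWrite. The following CPU-only sketch shows the index algebra being replaced; extract_lane and insert_lane here are hypothetical stand-ins, not Grid's API, and a "site" is modelled as a plain array of nsimd lanes.

// CPU-only sketch of the index decomposition the old accelerator_loopN spelled out.
#include <cstdint>
#include <cstdio>
#include <vector>

static double extract_lane(uint64_t lane, const std::vector<double> &v) { return v[lane]; }
static void   insert_lane (uint64_t lane, std::vector<double> &v, double s) { v[lane] = s; }

int main() {
  const uint64_t Nsite = 4, nsimd = 8;
  std::vector<std::vector<double>> x(Nsite, std::vector<double>(nsimd, 2.0));
  std::vector<std::vector<double>> y(Nsite, std::vector<double>(nsimd, 3.0));
  std::vector<std::vector<double>> z(Nsite, std::vector<double>(nsimd, 0.0));

  // Old style: one flat loop of Nsite*nsimd scalar work items; the (site, lane)
  // pair is recovered with division and modulo, as in accelerator_loopN.
  const uint64_t NN = Nsite * nsimd;
  for (uint64_t sss = 0; sss < NN; sss++) {
    uint64_t lane = sss % nsimd;
    uint64_t ss   = sss / nsimd;
    double xx = extract_lane(lane, x[ss]);
    double yy = extract_lane(lane, y[ss]);
    insert_lane(lane, z[ss], xx * yy);
  }

  // New style (shape of accelerator_for): iterate over sites only and let the
  // lane dimension be handled per site; in Grid, coalescedRead/coalescedWrite
  // take over the role of this inner lane loop.
  for (uint64_t ss = 0; ss < Nsite; ss++)
    for (uint64_t lane = 0; lane < nsimd; lane++)
      z[ss][lane] = x[ss][lane] * y[ss][lane];

  printf("z[0][0] = %f (expect 6.0)\n", z[0][0]);
  return 0;
}

On a GPU target the lane index naturally maps to adjacent threads, which is why the coalesced read/write form is preferred to explicit extractLane/insertLane per scalar work item.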
@@ -126,26 +121,21 @@ int main (int argc, char ** argv)
 auto z_v = z.View();
 const uint64_t Nsite = x_v.size();
 const uint64_t nsimd = vComplex::Nsimd();
-const uint64_t NN = Nsite*nsimd;
 for(int64_t i=0;i<Nwarm;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
 auto zz = xx*yy;
-insertLane(lane,x_v[ss],zz);
+coalescedWrite(x_v[ss],zz);
 });
 }
 double start=usecond();
 for(int64_t i=0;i<Nloop;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
 auto zz = xx*yy;
-insertLane(lane,x_v[ss],zz);
+coalescedWrite(x_v[ss],zz);
 });
 }
 double stop=usecond();
@@ -182,28 +172,23 @@ int main (int argc, char ** argv)
 auto z_v = z.View();
 const uint64_t Nsite = x_v.size();
 const uint64_t nsimd = vComplex::Nsimd();
-const uint64_t NN = Nsite*nsimd;
 for(int64_t i=0;i<Nwarm;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
-auto zz = extractLane(lane,z_v[ss]);
-zz = zz + xx * yy;
-insertLane(lane,z_v[ss],zz);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
+auto zz = coalescedRead(z_v[ss]);
+zz = zz+xx*yy;
+coalescedWrite(z_v[ss],zz);
 });
 }
 double start=usecond();
 for(int64_t i=0;i<Nloop;i++){
-accelerator_loopN( sss, NN, {
-uint64_t lane = sss % nsimd;
-uint64_t ss = sss / nsimd;
-auto xx = extractLane(lane,x_v[ss]);
-auto yy = extractLane(lane,y_v[ss]);
-auto zz = extractLane(lane,z_v[ss]);
-zz = zz + xx * yy;
-insertLane(lane,x_v[ss],zz);
+accelerator_for( ss, Nsite, nsimd ,{
+auto xx = coalescedRead(x_v[ss]);
+auto yy = coalescedRead(y_v[ss]);
+auto zz = coalescedRead(z_v[ss]);
+zz = zz+xx*yy;
+coalescedWrite(z_v[ss],zz);
 });
 }
 double stop=usecond();
||||||
@ -241,28 +226,23 @@ int main (int argc, char ** argv)
|
|||||||
auto w_v = z.View();
|
auto w_v = z.View();
|
||||||
const uint64_t Nsite = x_v.size();
|
const uint64_t Nsite = x_v.size();
|
||||||
const uint64_t nsimd = vComplex::Nsimd();
|
const uint64_t nsimd = vComplex::Nsimd();
|
||||||
const uint64_t NN = Nsite*nsimd;
|
|
||||||
for(int64_t i=0;i<Nwarm;i++){
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
accelerator_loopN( sss, NN, {
|
accelerator_for( ss, Nsite, nsimd ,{
|
||||||
uint64_t lane = sss % nsimd;
|
auto xx = coalescedRead(x_v[ss]);
|
||||||
uint64_t ss = sss / nsimd;
|
auto yy = coalescedRead(y_v[ss]);
|
||||||
auto xx = extractLane(lane,x_v[ss]);
|
auto zz = coalescedRead(z_v[ss]);
|
||||||
auto yy = extractLane(lane,y_v[ss]);
|
auto ww = zz+xx*yy;
|
||||||
auto zz = extractLane(lane,z_v[ss]);
|
coalescedWrite(w_v[ss],ww);
|
||||||
auto ww = zz + xx * yy;
|
|
||||||
insertLane(lane,w_v[ss],ww);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int64_t i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
accelerator_loopN( sss, NN, {
|
accelerator_for( ss, Nsite, nsimd ,{
|
||||||
uint64_t lane = sss % nsimd;
|
auto xx = coalescedRead(x_v[ss]);
|
||||||
uint64_t ss = sss / nsimd;
|
auto yy = coalescedRead(y_v[ss]);
|
||||||
auto xx = extractLane(lane,x_v[ss]);
|
auto zz = coalescedRead(z_v[ss]);
|
||||||
auto yy = extractLane(lane,y_v[ss]);
|
auto ww = zz+xx*yy;
|
||||||
auto zz = extractLane(lane,z_v[ss]);
|
coalescedWrite(w_v[ss],ww);
|
||||||
auto ww = zz + xx * yy;
|
|
||||||
insertLane(lane,w_v[ss],ww);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
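Both the old and new forms of these kernels keep the same measurement harness: Nwarm untimed warm-up iterations followed by Nloop iterations bracketed by usecond(). A self-contained sketch of that bookkeeping and how a bandwidth figure falls out of it follows; usecond_like() stands in for Grid's usecond(), and the assumed traffic of three doubles per site is illustrative, not taken from the benchmark source.

// Standalone sketch of the timing and bandwidth bookkeeping these benchmarks share.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <vector>

static double usecond_like() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  const uint64_t Nsite = 1 << 20;
  const int Nwarm = 10, Nloop = 100;
  std::vector<double> x(Nsite, 1.0), y(Nsite, 2.0), z(Nsite, 0.0);

  for (int i = 0; i < Nwarm; i++)                  // warm-up pass, not timed
    for (uint64_t s = 0; s < Nsite; s++) z[s] = x[s] * y[s];

  double start = usecond_like();
  for (int i = 0; i < Nloop; i++)                  // timed section
    for (uint64_t s = 0; s < Nsite; s++) z[s] = x[s] * y[s];
  double stop = usecond_like();

  // Two reads and one write of a double per site per iteration (assumed traffic).
  double bytes   = 3.0 * sizeof(double) * Nsite * Nloop;
  double seconds = (stop - start) * 1.0e-6;
  printf("elapsed %.1f us, effective bandwidth ~ %.2f GB/s\n", stop - start, bytes / seconds / 1.0e9);
  return 0;
}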