1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Compare commits

...

17 Commits

Author SHA1 Message Date
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
20 changed files with 267 additions and 73 deletions

View File

@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t; typedef cublasHandle_t gridblasHandle_t;
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t; typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
gridblasHandle = theGridAccelerator; gridblasHandle = theGridAccelerator;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
cl::sycl::gpu_selector selector; sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector }; sycl::device selectedDevice { selector };
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop); gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif #endif
gridblasInit=1; gridblasInit=1;

View File

@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
} }
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
GlobalSumVector((float *)&c,2); GlobalSumVector((float *)&c,2);
} }
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c) void CartesianCommunicator::GlobalSum(ComplexD &c)
{ {
GlobalSumVector((double *)&c,2); GlobalSumVector((double *)&c,2);
} }
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{ {
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);

View File

@ -128,6 +128,34 @@ public:
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<CommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);

View File

@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N) void CartesianCommunicator::GlobalSumVector(float *f,int N)
{ {
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);

View File

@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;

View File

@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
}); });
} }
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpy_norm"); GRID_TRACE("axpy_norm");
return axpy_norm_fast(ret,a,x,y); #ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpby_norm"); GRID_TRACE("axpby_norm");
return axpby_norm_fast(ret,a,b,x,y); #ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
} }
/// Trace product /// Trace product

View File

@ -290,8 +290,10 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t csum=0; uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{ {
// Hack // Hack
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0]; uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words); csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
} }
FlightRecorder::CsumLog(csum);
#endif #endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right); ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm); RealD local = real(nrm);
FlightRecorder::NormLog(real(nrm)); ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm)); FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
bool ok;
uint64_t csum=0;
uint64_t csum2=0;
#ifdef GRID_SYCL
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }

View File

@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{ {
sycl::buffer<sobj, 1> abuff(&ret, {1}); sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>()); auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites}, cgh.parallel_for(sycl::range<1>{osites},
Reduction, Reduction,
[=] (cl::sycl::id<1> item, auto &sum) { [=] (sycl::id<1> item, auto &sum) {
auto osite = item[0]; auto osite = item[0];
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
Word ret = 0; Word ret = 0;
{ {
sycl::buffer<Word, 1> abuff(&ret, {1}); sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(cl::sycl::range<1>{L}, cgh.parallel_for(sycl::range<1>{L},
Reduction, Reduction,
[=] (cl::sycl::id<1> index, auto &sum) { [=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index]; sum ^=vec[index];
}); });
}); });

View File

@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
}); });
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size}, cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction, Reduction,
[=](cl::sycl::id<1> item, auto &sum) { [=](sycl::id<1> item, auto &sum) {
auto s = item[0]; auto s = item[0];
sum += rb_p[r*subvol_size+s]; sum += rb_p[r*subvol_size+s];
}); });

View File

@ -364,9 +364,10 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
accelerator_barrier(); // All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
_grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too. // But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
@ -386,18 +387,20 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
FlightRecorder::StepLog("Start communicate complete");
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial(); if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet(); else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
accelerator_barrier(); // accelerator_barrier();
_grid->StencilBarrier(); _grid->StencilBarrier();
// run any checksums // run any checksums
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
} }
FlightRecorder::StepLog("Finish communicate complete");
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel. // Blocking send and receive. Either sequential or parallel.
@ -473,7 +476,7 @@ public:
template<class compressor> template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress) void HaloGather(const Lattice<vobj> &source,compressor &compress)
{ {
accelerator_barrier(); // accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid); assert(source.Grid()==_grid);
@ -487,7 +490,7 @@ public:
HaloGatherDir(source,compress,point,face_idx); HaloGatherDir(source,compress,point,face_idx);
} }
accelerator_barrier(); // All my local gathers are complete accelerator_barrier(); // All my local gathers are complete
_grid->StencilBarrier();// Synch shared memory on a single nodes // _grid->StencilBarrier();// Synch shared memory on a single nodes
face_table_computed=1; face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
} }

View File

@ -202,13 +202,13 @@ void acceleratorInit(void)
#ifdef GRID_SYCL #ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator; sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator; sycl::queue *theCopyAccelerator;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
// cl::sycl::gpu_selector selector; // sycl::gpu_selector selector;
// cl::sycl::device selectedDevice { selector }; // sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
@ -242,14 +242,14 @@ void acceleratorInit(void)
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = cl::sycl::device::get_devices(); auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){ for(int d = 0;d<devices.size();d++){
#define GPU_PROP_STR(prop) \ #define GPU_PROP_STR(prop) \
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str()); printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
#define GPU_PROP_FMT(prop,FMT) \ #define GPU_PROP_FMT(prop,FMT) \
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>()); printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
if ( world_rank == 0) { if ( world_rank == 0) {

View File

@ -302,7 +302,7 @@ NAMESPACE_END(Grid);
// Force deterministic reductions // Force deterministic reductions
#define SYCL_REDUCTION_DETERMINISTIC #define SYCL_REDUCTION_DETERMINISTIC
#include <sycl/CL/sycl.hpp> #include <sycl/sycl.hpp>
#include <sycl/usm.hpp> #include <sycl/usm.hpp>
#include <level_zero/ze_api.h> #include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp> #include <sycl/ext/oneapi/backend/level_zero.hpp>
@ -314,8 +314,8 @@ inline void acceleratorMem(void)
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl; std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
} }
extern cl::sycl::queue *theGridAccelerator; extern sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator; extern sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__ #ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT #define GRID_SIMT
@ -326,24 +326,24 @@ extern cl::sycl::queue *theCopyAccelerator;
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
#else #else
return 0; return 0;
#endif #endif
} // SYCL specific } // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ theGridAccelerator->submit([&](sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \ unsigned long nt=acceleratorThreads(); \
if(nt < 8)nt=8; \ if(nt < 8)nt=8; \
unsigned long unum1 = num1; \ unsigned long unum1 = num1; \
unsigned long unum2 = num2; \ unsigned long unum2 = num2; \
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
cl::sycl::range<3> local {nt,1,nsimd}; \ sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cgh.parallel_for( \ cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \ sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \ [=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \ [[intel::reqd_sub_group_size(16)]] \
{ \ { \
auto iter1 = item.get_global_id(0); \ auto iter1 = item.get_global_id(0); \
@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
inline int acceleratorIsCommunicable(void *ptr) inline int acceleratorIsCommunicable(void *ptr)
{ {
#if 0 #if 0
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm = cl::sycl::usm::alloc::shared ) return 1; if ( uvm = sycl::usm::alloc::shared ) return 1;
else return 0; else return 0;
#endif #endif
return 1; return 1;

View File

@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode; int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms; int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend; int FlightRecorder::ChecksumCommsSend;
const char * FlightRecorder::StepName;
int32_t FlightRecorder::StepLoggingCounter;
int32_t FlightRecorder::XmitLoggingCounter; int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter; int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter; int32_t FlightRecorder::CsumLoggingCounter;
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
CsumLoggingCounter=0; CsumLoggingCounter=0;
NormLoggingCounter=0; NormLoggingCounter=0;
ReductionLoggingCounter=0; ReductionLoggingCounter=0;
StepName = "No steps started";
StepLoggingCounter=0;
} }
void FlightRecorder::Truncate(void) void FlightRecorder::Truncate(void)
{ {
@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
assert(0); assert(0);
} }
} }
bool FlightRecorder::StepLog(const char *name)
{
StepName = name;
StepLoggingCounter ++;
return true;
}
void FlightRecorder::SetLoggingModePrint(void) void FlightRecorder::SetLoggingModePrint(void)
{ {
@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
{ {
return ErrorCounter; return ErrorCounter;
} }
void FlightRecorder::NormLog(double value) bool FlightRecorder::NormLog(double value)
{ {
uint64_t hex = * ( (uint64_t *)&value ); uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value); NormLogVector.push_back(value);
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
NormLoggingCounter,NormLogVector.size(), NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr); value, NormLogVector[NormLoggingCounter]); fflush(stderr);
if(!ContinueOnFail)assert(0); // Force takedown of job BACKTRACEFP(stderr);
if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
} else { } else {
@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
} }
NormLoggingCounter++; NormLoggingCounter++;
} }
return true;
} }
void FlightRecorder::CsumLog(uint64_t hex) bool FlightRecorder::CsumLog(uint64_t hex)
{ {
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex); CsumLogVector.push_back(hex);
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
if ( hex != hexref ) { if ( hex != hexref ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref); CsumLoggingCounter,hex, hexref);
BACKTRACEFP(stderr);
fflush(stderr); fflush(stderr);
if(!ContinueOnFail) assert(0); // Force takedown of job if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
} }
CsumLoggingCounter++; CsumLoggingCounter++;
} }
return true;
} }
void FlightRecorder::ReductionLog(double local,double global) void FlightRecorder::ReductionLog(double local,double global)
{ {
uint64_t hex_l = * ( (uint64_t *)&local ); uint64_t hex_l = * ( (uint64_t *)&local );
@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){ if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) { if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(), ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -267,11 +297,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){ if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) { if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n", fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(), XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -309,11 +343,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){ if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) { if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n", fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(), RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);

View File

@ -12,6 +12,8 @@ class FlightRecorder {
static int LoggingMode; static int LoggingMode;
static uint64_t ErrorCounter; static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
static int32_t XmitLoggingCounter; static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter; static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter; static int32_t CsumLoggingCounter;
@ -30,8 +32,9 @@ class FlightRecorder {
static void SetLoggingModeRecord(void); static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void); static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode); static void SetLoggingMode(LoggingMode_t mode);
static void NormLog(double value); static bool StepLog(const char *name);
static void CsumLog(uint64_t csum); static bool NormLog(double value);
static bool CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl); static void ReductionLog(double lcl, double glbl);
static void Truncate(void); static void Truncate(void);
static void ResetCounters(void); static void ResetCounters(void);

View File

@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname); fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code); fprintf(stderr," code %d\n",si->si_code);

View File

@ -128,6 +128,20 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac esac
############### internal reduction
AC_ARG_ENABLE([reduction],
[AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
[ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
case ${ac_REDUCTION} in
mpi)
;;
grid)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
*)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
esac
############### tracing ############### tracing
AC_ARG_ENABLE([tracing], AC_ARG_ENABLE([tracing],
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],

View File

@ -1,8 +1,14 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions "
../../configure \ ../../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-debug \ --enable-debug \

View File

@ -1,7 +1,8 @@
module load oneapi/release/2023.12.15.001 #module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2 #module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic #module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35 #module load intel_compute_runtime/release/821.35
source ~/spack/share/spack/setup-env.sh source ~/spack/share/spack/setup-env.sh
spack load c-lime spack load c-lime
spack load openssl spack load openssl

View File

@ -124,6 +124,8 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
// SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
// SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
int nsecs=600; int nsecs=600;
if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@ -131,6 +133,10 @@ int main (int argc, char ** argv)
GridCmdOptionInt(arg,nsecs); GridCmdOptionInt(arg,nsecs);
} }
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
UGrid->Barrier();
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl; std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
@ -148,7 +154,7 @@ int main (int argc, char ** argv)
FlightRecorder::ContinueOnFail = 0; FlightRecorder::ContinueOnFail = 0;
FlightRecorder::PrintEntireLog = 0; FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 1; FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0; FlightRecorder::ChecksumCommsSend=0;
if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
@ -180,7 +186,7 @@ int main (int argc, char ** argv)
iter ++; iter ++;
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
} while (now < (start + nsecs/10) ); } while (now < (start + nsecs/10) );
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
int i=0; int i=0;