1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-12-04 13:24:40 +00:00

Compare commits

..

11 Commits

Author SHA1 Message Date
7132a4fd28 Update 2025-12-02 23:22:42 +00:00
e8057d6b4a Updated for verbose on host vs. device side csum 2025-12-02 23:15:32 +00:00
973584e039 Update on aurora 2025-12-02 22:24:54 +00:00
Peter Boyle
ea46c2dc3c config command 2025-12-02 16:01:37 -05:00
Peter Boyle
50bcd76fc1 Changes to help with error logging on aurora -- triage MPI / Slingshot vs. host-device / SYCL on checksum error 2025-12-02 15:51:29 -05:00
4a0aaf0786 Fix issue with Aurora compilers 2025-11-21 21:41:13 +00:00
9c3835524c Fix compile warn 2025-11-21 21:41:12 +00:00
549351bb8a Stag verbose clean up 2025-11-20 18:22:57 +00:00
Peter Boyle
74e6b19f83 Looks like the reuse of xfers in staggered has bugs or corner cases depending on volume 2025-11-17 22:29:06 -05:00
Peter Boyle
2e684028de Improvements 2025-11-14 18:12:27 -05:00
c54d87a472 Aurora compile fix for new compiler 2025-11-06 18:17:33 +00:00
19 changed files with 170 additions and 35 deletions

View File

@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.PacketType = InterNodeRecv; srq.PacketType = InterNodeRecv;
srq.bytes = rbytes; srq.bytes = rbytes;
srq.req = rrq; srq.req = rrq;
srq.dest = from;
srq.host_buf = host_recv; srq.host_buf = host_recv;
srq.device_buf = recv; srq.device_buf = recv;
srq.tag = tag; srq.tag = tag;
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
srq.host_buf = NULL; srq.host_buf = NULL;
srq.device_buf = xmit; srq.device_buf = xmit;
srq.tag = -1; srq.tag = -1;
srq.dest = dest; srq.dest = from;
srq.commdir = dir; srq.commdir = dir;
list.push_back(srq); list.push_back(srq);
} }
@@ -817,12 +818,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0); GRID_ASSERT(ierr==0);
} }
// for(int r=0;r<nreq;r++){ // for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) { // if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes); // acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// } // }
// } // }
// Get global error count in comms receive
#define AUDIT_FLIGHT_RECORDER_ERRORS
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
uint64_t EC = FlightRecorder::CommsErrorCount();
if (EC) std::cerr << " global sum error count "<<EC<<std::endl;
this->GlobalSum(EC);
if (EC) {
for(int r=0;r<list.size();r++){
#ifdef GRID_CHECKSUM_COMMS
uint64_t rbytes_data = list[r].bytes - 8;
#else
uint64_t rbytes_data = list[r].bytes;
#endif
if (list[r].PacketType == InterNodeReceiveHtoD) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
} if (list[r].PacketType == InterNodeXmitISend ) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
}
}
}
#endif
#ifdef GRID_CHECKSUM_COMMS #ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){ for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) { if ( list[r].PacketType == InterNodeReceiveHtoD ) {

View File

@@ -27,6 +27,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
void GridAbort(void) { abort(); }
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -34,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
void GridAbort(void) { abort(); }
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {

View File

@@ -124,7 +124,7 @@ template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &
void *hsend_buf = (void *)&hrhs[0]; void *hsend_buf = (void *)&hrhs[0];
void *hrecv_buf = (void *)&hret[0]; void *hrecv_buf = (void *)&hret[0];
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);
grid->SendToRecvFrom(hsend_buf, grid->SendToRecvFrom(hsend_buf,
xmit_to_rank, xmit_to_rank,
@@ -132,7 +132,7 @@ template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &
recv_from_rank, recv_from_rank,
bytes); bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
#endif #endif
} }
} }

View File

@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result; return result;
} }
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{ {
Word identity; identity=0; Word identity; identity=0;
Word ret = 0; Word ret = 0;
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait(); theGridAccelerator->wait();
return ret; return ret;
} }
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
Word csum=0;
for(uint64_t w=0;w<L;w++){
csum = csum ^ vec[w];
}
return csum;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
gpu_xor(vec,L);
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L) template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{ {
Word identity; identity=0; Word identity; identity=0;

View File

@@ -411,7 +411,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody #undef LoopBody
} }
#ifdef GRID_SYCL #if 0
extern "C" { extern "C" {
ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void ); ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void ); uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );

View File

@@ -138,10 +138,13 @@ public:
//auto start = std::chrono::high_resolution_clock::now(); //auto start = std::chrono::high_resolution_clock::now();
autoView(U_v,U,AcceleratorWrite); autoView(U_v,U,AcceleratorWrite);
autoView(P_v,P,AcceleratorRead); autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{ typedef typename Field::vector_object vobj;
const int Nsimd = vobj::Nsimd();
accelerator_for(ss, P.Grid()->oSites(),Nsimd,{
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu); auto tmp = Exponentiate(P_v(ss)(mu), ep, Nexp) * U_v(ss)(mu);
U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu)); tmp = Group::ProjectOnGeneralGroup(tmp);
coalescedWrite(U_v[ss](mu),tmp);
} }
}); });
//auto end = std::chrono::high_resolution_clock::now(); //auto end = std::chrono::high_resolution_clock::now();

View File

@@ -291,8 +291,8 @@ public:
int idx=0; int idx=0;
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
for(int nu=0;nu<4;nu++){ for(int nu=0;nu<4;nu++){
if ( mu!=nu) GRID_ASSERT(this->StoutSmearing->SmearRho[idx]==rho); if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
else GRID_ASSERT(this->StoutSmearing->SmearRho[idx]==0.0); else assert(this->StoutSmearing->SmearRho[idx]==0.0);
idx++; idx++;
}} }}
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
@@ -825,6 +825,7 @@ public:
virtual void fill_smearedSet(GaugeField &U) virtual void fill_smearedSet(GaugeField &U)
{ {
this->ThinLinks = &U; // attach the smearing routine to the field U this->ThinLinks = &U; // attach the smearing routine to the field U
std::cout << GridLogMessage << " fill_smearedSet " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
// check the pointer is not null // check the pointer is not null
if (this->ThinLinks == NULL) if (this->ThinLinks == NULL)
@@ -846,6 +847,8 @@ public:
ApplyMask(smeared_A,smearLvl); ApplyMask(smeared_A,smearLvl);
smeared_B = previous_u; smeared_B = previous_u;
ApplyMask(smeared_B,smearLvl); ApplyMask(smeared_B,smearLvl);
std::cout << GridLogMessage << " smeared_A " << norm2(smeared_A) << std::endl;
std::cout << GridLogMessage << " smeared_B " << norm2(smeared_B) << std::endl;
// Replace only the masked portion // Replace only the masked portion
this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A; this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
previous_u = this->SmearedSet[smearLvl]; previous_u = this->SmearedSet[smearLvl];
@@ -934,10 +937,10 @@ public:
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
{ {
GRID_ASSERT(Nsmear%(2*Nd)==0); // Or multiply by 8?? assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
// was resized in base class // was resized in base class
GRID_ASSERT(this->SmearedSet.size()==Nsmear); assert(this->SmearedSet.size()==Nsmear);
GridRedBlackCartesian * UrbGrid; GridRedBlackCartesian * UrbGrid;
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);

View File

@@ -54,7 +54,7 @@ public:
// Usual cases are not used // Usual cases are not used
////////////////////////////////// //////////////////////////////////
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ GRID_ASSERT(0);}; virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ GRID_ASSERT(0);};
virtual RealD S(const GaugeField &U) { GRID_ASSERT(0); } virtual RealD S(const GaugeField &U) { GRID_ASSERT(0); return 0; }
virtual void deriv(const GaugeField &U, GaugeField &dSdU) { GRID_ASSERT(0); } virtual void deriv(const GaugeField &U, GaugeField &dSdU) { GRID_ASSERT(0); }
////////////////////////////////// //////////////////////////////////

View File

@@ -751,7 +751,7 @@ public:
obj.xbytes = xbytes; obj.xbytes = xbytes;
obj.rbytes = rbytes; obj.rbytes = rbytes;
obj.cb = cb; obj.cb = cb;
for(int i=0;i<CachedTransfers.size();i++){ for(int i=0;i<CachedTransfers.size();i++){
if ( (CachedTransfers[i].direction ==direction) if ( (CachedTransfers[i].direction ==direction)
&&(CachedTransfers[i].OrthogPlane==OrthogPlane) &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
@@ -763,11 +763,13 @@ public:
){ ){
// FIXME worry about duplicate with partial compression // FIXME worry about duplicate with partial compression
// Wont happen as DWF has no duplicates, but... // Wont happen as DWF has no duplicates, but...
AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes); // AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
return 1; // std::cout << "Duplicate dir " <<direction<<" "<<" OrthogPlane "<<OrthogPlane<<" Dest"<<DestProc <<" xbytes " <<xbytes<<" lane "<< lane<<" cb "<<cb<<std::endl;
return 0;
// return 1;
} }
} }
CachedTransfers.push_back(obj); CachedTransfers.push_back(obj);
return 0; return 0;
} }

View File

@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter; int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter; int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter; uint64_t FlightRecorder::ErrorCounter;
uint64_t FlightRecorder::CommsErrorCounter;
std::vector<double> FlightRecorder::NormLogVector; std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector; std::vector<double> FlightRecorder::ReductionLogVector;
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
ResetCounters(); ResetCounters();
LoggingMode = LoggingModeVerify; LoggingMode = LoggingModeVerify;
} }
uint64_t FlightRecorder::CommsErrorCount(void)
{
return CommsErrorCounter;
}
uint64_t FlightRecorder::ErrorCount(void) uint64_t FlightRecorder::ErrorCount(void)
{ {
return ErrorCounter; return ErrorCounter;
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if ( !ContinueOnFail ) GRID_ASSERT(0); if ( !ContinueOnFail ) GRID_ASSERT(0);
ErrorCounter++; ErrorCounter++;
} else { } else {
if ( PrintEntireLog ) { if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl; std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if ( !ContinueOnFail ) GRID_ASSERT(0); if ( !ContinueOnFail ) GRID_ASSERT(0);
CommsErrorCounter++;
ErrorCounter++; ErrorCounter++;
} else { } else {
if ( PrintEntireLog ) { if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl; std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;

View File

@@ -9,8 +9,8 @@ class FlightRecorder {
LoggingModeRecord, LoggingModeRecord,
LoggingModeVerify LoggingModeVerify
}; };
static int LoggingMode; static int LoggingMode;
static uint64_t CommsErrorCounter;
static uint64_t ErrorCounter; static uint64_t ErrorCounter;
static const char * StepName; static const char * StepName;
static int32_t StepLoggingCounter; static int32_t StepLoggingCounter;
@@ -39,6 +39,7 @@ class FlightRecorder {
static void Truncate(void); static void Truncate(void);
static void ResetCounters(void); static void ResetCounters(void);
static uint64_t ErrorCount(void); static uint64_t ErrorCount(void);
static uint64_t CommsErrorCount(void);
static void xmitLog(void *,uint64_t bytes); static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank); static void recvLog(void *,uint64_t bytes,int rank);
}; };

View File

@@ -755,7 +755,7 @@ void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
sig_print_uint(si->si_code); sig_print_uint(si->si_code);
SIGLOG("\n"); SIGLOG("\n");
unw_context_t *uc= (unw_context_t *)ptr; ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n"); SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND #ifdef HAVE_UNWIND

View File

@@ -409,16 +409,10 @@ AC_SEARCH_LIBS([unw_backtrace], [unwind],
[have_unwind=true], [have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)]) [AC_MSG_WARN(libunwind library was not found in your system.)])
AS_CASE([$host_cpu], [x86_64], AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64], [AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])] [have_unwind_x86_64=true],
[have_unwind_x86_64=true], [AC_MSG_WARN(libunwind library was not found in your system.)])
[AC_MSG_WARN(libunwind library was not found in your system.)])],
[aarch64],
[AC_SEARCH_LIBS([_Uaarch64_step], [unwind-aarch64],
[AC_DEFINE([HAVE_UNWIND_AARCH64], [1], [Define to 1 if you have the `libunwind-aarch64' library])]
[have_unwind_aarch64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])])
AC_SEARCH_LIBS([SHA256_Init], [crypto], AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])] [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]

View File

@@ -10,7 +10,7 @@ export HDF5=/opt/cray/pe/hdf5/1.12.2.3/gnu/9.1
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--enable-shm=nvlink \ --enable-shm=nvlink \
--enable-checksum-comms=yes \ --enable-checksum-comms=no \
--enable-log-views=yes \ --enable-log-views=yes \
--enable-accelerator=sycl \ --enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no \ --enable-accelerator-aware-mpi=no \

View File

@@ -51,7 +51,7 @@ EOF
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \ CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \ ../gpu_tile.sh \
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \ $BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message " --shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message --debug-stdout "
echo $CMD > command-line echo $CMD > command-line
env > environment env > environment

View File

@@ -0,0 +1,62 @@
#!/bin/bash
#PBS -l select=256
#PBS -q run_next
##PBS -A LatticeQCD_aesp_CNDA
#PBS -A LatticeFlavor
#PBS -l walltime=06:00:00
#PBS -N reproBigJob
#PBS -k doe
#PBS -l filesystems=flare
#PBS -l filesystems=home
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=1
export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
export MPIR_CVAR_NOLOCAL=1
cd $PBS_O_WORKDIR
source ../sourceme.sh
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR
cd $DIR
cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -HUP Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 3072 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \
$BINARY --mpi 8.8.8.6 --grid 128.128.128.288 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --log Message --debug-stdout "
echo $CMD > command-line
env > environment
$CMD
grep Oops */Grid.stderr.* > failures.$PBS_JOBID

View File

@@ -0,0 +1,12 @@
CXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ ../../configure \
--enable-simd=GEN \
--enable-comms=mpi3 \
--enable-debug \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/BugHunt/install \
--with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
--with-openssl=$BREW \
--disable-fermion-reps \
--disable-gparity \
--enable-debug

View File

@@ -82,6 +82,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
std::cout << GridLogMessage<<" in main(): Grid is initialised"<<std::endl;
const int Ls=12; const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
@@ -94,6 +95,7 @@ int main (int argc, char ** argv)
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
std::cout << GridLogMessage<<" in main(): making RNGs"<<std::endl;
std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
@@ -152,7 +154,7 @@ int main (int argc, char ** argv)
time_t start = time(NULL); time_t start = time(NULL);
UGrid->Broadcast(0,(void *)&start,sizeof(start)); UGrid->Broadcast(0,(void *)&start,sizeof(start));
FlightRecorder::ContinueOnFail = 0; FlightRecorder::ContinueOnFail = 1;
FlightRecorder::PrintEntireLog = 0; FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 0; FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0; FlightRecorder::ChecksumCommsSend=0;

View File

@@ -62,6 +62,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
Gimpl::generate_momenta(P,sRNG,RNG4); Gimpl::generate_momenta(P,sRNG,RNG4);
// Filter.applyFilter(P); // Filter.applyFilter(P);
std::cout << GridLogMessage << "Initial momenta " << norm2(P) << std::endl;
action.refresh(smU,sRNG,RNG4); action.refresh(smU,sRNG,RNG4);
@@ -70,6 +71,8 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl; std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S1 = action.S(smU); RealD S1 = action.S(smU);
std::cout << GridLogMessage << "Initial action " << S1 << std::endl;
Gimpl::update_field(P,U,eps); Gimpl::update_field(P,U,eps);
smU.set_Field(U); smU.set_Field(U);
@@ -80,6 +83,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
action.deriv(smU,UdSdU); action.deriv(smU,UdSdU);
UdSdU = Ta(UdSdU); UdSdU = Ta(UdSdU);
// Filter.applyFilter(UdSdU); // Filter.applyFilter(UdSdU);
std::cout << GridLogMessage << "Derivative " << norm2(UdSdU) << std::endl;
DumpSliceNorm("Force",UdSdU,Nd-1); DumpSliceNorm("Force",UdSdU,Nd-1);
@@ -91,6 +95,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl; std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S2 = action.S(smU); RealD S2 = action.S(smU);
std::cout << GridLogMessage << "Final action " << S1 << std::endl;
// Use the derivative // Use the derivative
LatticeComplex dS(UGrid); dS = Zero(); LatticeComplex dS(UGrid); dS = Zero();
@@ -145,6 +150,8 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
SU<Nc>::HotConfiguration(RNG4,U); SU<Nc>::HotConfiguration(RNG4,U);
#endif #endif
std::cout << GridLogMessage << "Initial plaquette: " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
WilsonGaugeActionR PlaqAction(6.0); WilsonGaugeActionR PlaqAction(6.0);