Scripts

Update for Jupiter
Updated python output
2025-06-13 20:57:06 +01:00 · 2025-06-11 15:30:16 +02:00 · 2025-06-11 15:24:34 +02:00 · 2025-06-03 14:09:29 -04:00 · 2025-05-23 20:58:16 +00:00 · 2025-05-23 20:57:11 +00:00
21 changed files with 904 additions and 37 deletions
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -269,7 +269,9 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
+    grid->Barrier();
    axpby(T1,xscale,mscale,y,in);
+    grid->Barrier();

    // sum = .5 c[0] T0 + c[1] T1
    //    out = ()*T0 + Coeffs[1]*T1;
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("AllReduce");
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+  FlightRecorder::StepLog("AllReduceVector");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
@@ -794,6 +801,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque

 void CartesianCommunicator::StencilBarrier(void)
 {
+  FlightRecorder::StepLog("NodeBarrier");
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -801,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+  FlightRecorder::StepLog("GridBarrier");
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("Broadcast");
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
@@ -824,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("BroadcastWorld");
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
@@ -846,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
+  FlightRecorder::StepLog("AllToAll");
  // MPI is a pain and uses "int" arguments
  // 64*64*64*128*16 == 500Million elements of data.
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -990,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  SharedMemoryTest();
+  //  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -236,7 +236,7 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
-#if 0
+#if 1
    deviceVector<vobj> vvtmp(1);
    acceleratorPut(vvtmp[0],vtmp);
    vobj *vvtmp_p = & vvtmp[0];
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{

  out = in;
  RealD taus = 0.;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(0,taus,out);
+  
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
    evolve_step(out, taus);
@@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
  RealD taus = 0.;
  RealD eps = init_epsilon;
  unsigned int step = 0;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(step,taus,out);
+  
  do{
    int step_success = evolve_step_adaptive(out, taus, eps); 
    step += step_success; //step will not be incremented if the integration step fails
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -396,6 +396,7 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
+    FlightRecorder::StepLog("Communicate begin has finished");
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    cgh.parallel_for(							\
 		     sycl::nd_range<3>(global,local),			\
 		     [=] (sycl::nd_item<3> item) /*mutable*/		\
-		     [[intel::reqd_sub_group_size(16)]]			\
+		     [[sycl::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -638,12 +638,11 @@ void Grid_debug_handler_init(void)
  sa.sa_flags    = SA_SIGINFO;
  //  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
-  sigaction(SIGBUS,&sa,NULL);
+  //  sigaction(SIGBUS,&sa,NULL);
  //  sigaction(SIGUSR2,&sa,NULL);

-  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
-
-  sigaction(SIGFPE,&sa,NULL);
+  //  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  //  sigaction(SIGFPE,&sa,NULL);
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);

--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -66,6 +66,7 @@ namespace Grid{
  };
 }

+
 template <class T> void writeFile(T& in, std::string const fname){  
 #ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
  WR.close();
 #endif
  // What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {

  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

+#if 0    
  CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
+#else
+  // Don't require Grid format RNGs
+  FieldMetaData header;
+  std::string file, filesmr;
+  file    = CPar.conf_path + "/" + CPar.conf_prefix      + "." + std::to_string(conf);
+  filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix  + "." + std::to_string(conf);

+  NerscIO::readConfiguration(Umu,header,file);
+#endif
+  
  std::cout << std::setprecision(15);
  std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
  
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
  std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);

  WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
+  
  WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
    
    typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
    //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
    //Plq = coeff * Plq;

-    int tau = std::round(t);
-    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(R,efile);
-    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(qfield,tfile);

+    RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
+
+    int tau = std::round(t);
+
+    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(R,efile);
+
+    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(qfield,tfile);
+
+    std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
+    {
+      //      PeriodicGimplR::GaugeField Ucopy = U;
+      //      NerscIO::writeConfiguration(Ucopy,ufile);
+    }
+    
    RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
    RealD T = real( sum(qfield) );
    Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
    RealD E0 = real(peekSite(R,scoor));
    RealD T0 = real(peekSite(qfield,scoor));
    std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  "
-	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
+	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
    
  });
  
  int t=WFPar.maxTau;
  WF.smear(Uflow, Umu);
-
+  //  NerscIO::writeConfiguration(Uflow,filesmr);
+  
+  
  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
+  RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
  RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t
  RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow);
-  std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl;
-  std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl;
-  std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl;
-  std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "Plaquette            "<< conf << "   " << WFlow_plaq << std::endl;
+  std::cout << GridLogMessage << "T0                   "<< conf << "   " << WFlow_T0 << std::endl;
+  std::cout << GridLogMessage << "TC0                  "<< conf << "   " << WFlow_EC << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge    "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << "   " << WFlow_TC5Li<< std::endl;

  std::cout<< GridLogMessage << " Admissibility check:\n";
  const double sp_adm = 0.067;                // admissible threshold
--- a/systems/Jupiter/benchmarks/dwf.1node.perf
+++ b/systems/Jupiter/benchmarks/dwf.1node.perf
@@ -0,0 +1,273 @@
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]:   totalGlobalMem: 102005473280 
+AcceleratorCudaInit[0]:   managedMemory: 1 
+AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
+AcceleratorCudaInit[0]:   warpSize: 32 
+AcceleratorCudaInit[0]:   pciBusID: 1 
+AcceleratorCudaInit[0]:   pciDeviceID: 0 
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device 
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or 
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
+AcceleratorCudaInit: Configure options --enable-setdevice=no 
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 4
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers 
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
+__|_                                    _|__
+__|_   GGGG    RRRR    III    DDDD      _|__
+__|_  G        R   R    I     D   D     _|__
+__|_  G        R   R    I     D    D    _|__
+__|_  G  GG    RRRR     I     D    D    _|__
+__|_  G   G    R  R     I     D   D     _|__
+__|_   GGGG    R   R   III    DDDD      _|__
+__|_                                    _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================ 
+Grid : Message : MPI is initialised and logging filters activated 
+Grid : Message : ================================================ 
+Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers 
+Grid : Message : MemoryManager Cache 81604378624 bytes 
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
+
+
+
+
+
+Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.309000 s :  Testing with full communication 
+Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.313000 s : Grid Layout
+Grid : Message : 0.313000 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 0.319000 s : 	OpenMP threads       : 4
+Grid : Message : 0.320000 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 0.129590 s : Initialising 4d RNG
+Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 0.942440 s : Initialising 5d RNG
+Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+local rank 1 device 0 bus id: 0019:01:00.0
+local rank 2 device 0 bus id: 0029:01:00.0
+local rank 3 device 0 bus id: 0039:01:00.0
+Grid : Message : 43.893114 s : Drawing gauge field
+Grid : Message : 54.574150 s : Random gauge initialised 
+Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 54.580032 s : Setting up Cshift based reference 
+Grid : Message : 60.407451 s : *****************************************************************
+Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 60.407470 s : *****************************************************************
+Grid : Message : 60.407471 s : *****************************************************************
+Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 60.407473 s : * Vectorising space-time by 8
+Grid : Message : 60.407475 s : * VComplex size is 64 B
+Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
+Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 60.407480 s : *****************************************************************
+Grid : Message : 61.102178 s : Called warmup
+Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
+Grid : Message : 62.177198 s : mflop/s =   24721998.6
+Grid : Message : 62.177201 s : mflop/s per rank =  6180499.64
+Grid : Message : 62.177204 s : mflop/s per node =  24721998.6
+Grid : Message : 62.182696 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 71.328862 s : ----------------------------------------------------------------
+Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 71.328885 s : ----------------------------------------------------------------
+Grid : Message : 71.328886 s : Called DwDag
+Grid : Message : 71.328887 s : norm dag result 4.12810493
+Grid : Message : 71.329493 s : norm dag ref    4.12810493
+Grid : Message : 71.331967 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 71.803650 s : src_e0.500003185
+Grid : Message : 71.819727 s : src_o0.499996882
+Grid : Message : 71.821991 s : *********************************************************
+Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 71.821995 s : * Vectorising space-time by 8
+Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
+Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 71.822003 s : *********************************************************
+Grid : Message : 72.377054 s : Deo mflop/s =   24065467
+Grid : Message : 72.377071 s : Deo mflop/s per rank   6016366.75
+Grid : Message : 72.377074 s : Deo mflop/s per node   24065467
+Grid : Message : 72.624877 s : r_e2.06377678
+Grid : Message : 72.625198 s : r_o2.06381058
+Grid : Message : 72.625507 s : res4.12758736
+Grid : Message : 73.759140 s : norm diff   0
+Grid : Message : 73.868204 s : norm diff even  0
+Grid : Message : 73.907201 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414582 s :  Testing without internode communication 
+Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414586 s : Grid Layout
+Grid : Message : 74.414586 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 74.414594 s : 	OpenMP threads       : 4
+Grid : Message : 74.414595 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 74.679364 s : Initialising 4d RNG
+Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 74.759525 s : Initialising 5d RNG
+Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 119.252016 s : Drawing gauge field
+Grid : Message : 129.919846 s : Random gauge initialised 
+Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 129.923611 s : Setting up Cshift based reference 
+Grid : Message : 135.522878 s : *****************************************************************
+Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 135.522901 s : * Vectorising space-time by 8
+Grid : Message : 135.522903 s : * VComplex size is 64 B
+Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
+Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 135.522908 s : *****************************************************************
+Grid : Message : 136.151202 s : Called warmup
+Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
+Grid : Message : 137.224748 s : mflop/s =   24755806
+Grid : Message : 137.224751 s : mflop/s per rank =  6188951.49
+Grid : Message : 137.224753 s : mflop/s per node =  24755806
+Grid : Message : 137.235239 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 146.451686 s : ----------------------------------------------------------------
+Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 146.451710 s : ----------------------------------------------------------------
+Grid : Message : 146.451712 s : Called DwDag
+Grid : Message : 146.451714 s : norm dag result 4.12810493
+Grid : Message : 146.452323 s : norm dag ref    4.12810493
+Grid : Message : 146.454799 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 146.940894 s : src_e0.500003185
+Grid : Message : 146.953676 s : src_o0.499996882
+Grid : Message : 146.955927 s : *********************************************************
+Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 146.955932 s : * Vectorising space-time by 8
+Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
+Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 146.955941 s : *********************************************************
+Grid : Message : 147.511975 s : Deo mflop/s =   24036256.5
+Grid : Message : 147.511989 s : Deo mflop/s per rank   6009064.13
+Grid : Message : 147.511991 s : Deo mflop/s per node   24036256.5
+Grid : Message : 147.522100 s : r_e2.06377678
+Grid : Message : 147.522433 s : r_o2.06381058
+Grid : Message : 147.522745 s : res4.12758736
+Grid : Message : 148.229848 s : norm diff   0
+Grid : Message : 149.233474 s : norm diff even  0
+Grid : Message : 149.235815 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960990 s :  Testing without intranode communication 
+Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960995 s : Grid Layout
+Grid : Message : 149.960995 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 149.961003 s : 	OpenMP threads       : 4
+Grid : Message : 149.961004 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 150.155810 s : Initialising 4d RNG
+Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 150.973420 s : Initialising 5d RNG
+Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 193.933765 s : Drawing gauge field
+Grid : Message : 204.611551 s : Random gauge initialised 
+Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 204.615265 s : Setting up Cshift based reference 
+Grid : Message : 210.117788 s : *****************************************************************
+Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 210.117809 s : *****************************************************************
+Grid : Message : 210.117810 s : *****************************************************************
+Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 210.117813 s : * Vectorising space-time by 8
+Grid : Message : 210.117814 s : * VComplex size is 64 B
+Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
+Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 210.117819 s : *****************************************************************
+Grid : Message : 210.714641 s : Called warmup
+Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
+Grid : Message : 211.892252 s : mflop/s =   22568003.2
+Grid : Message : 211.892255 s : mflop/s per rank =  5642000.8
+Grid : Message : 211.892257 s : mflop/s per node =  22568003.2
+Grid : Message : 211.896037 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 220.751375 s : ----------------------------------------------------------------
+Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 220.751409 s : ----------------------------------------------------------------
+Grid : Message : 220.751411 s : Called DwDag
+Grid : Message : 220.751412 s : norm dag result 4.12810493
+Grid : Message : 220.753307 s : norm dag ref    4.12810493
+Grid : Message : 220.755796 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 221.697800 s : src_e0.500003185
+Grid : Message : 221.890920 s : src_o0.499996882
+Grid : Message : 221.913430 s : *********************************************************
+Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 221.913480 s : * Vectorising space-time by 8
+Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
+Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 221.913550 s : *********************************************************
+Grid : Message : 221.645213 s : Deo mflop/s =   24114032
+Grid : Message : 221.645228 s : Deo mflop/s per rank   6028508.01
+Grid : Message : 221.645231 s : Deo mflop/s per node   24114032
+Grid : Message : 221.656021 s : r_e2.06377678
+Grid : Message : 221.656389 s : r_o2.06381058
+Grid : Message : 221.656698 s : res4.12758736
+Grid : Message : 222.110075 s : norm diff   0
+Grid : Message : 222.857692 s : norm diff even  0
+Grid : Message : 222.875763 s : norm diff odd   0
+Grid : Message : 223.598127 s : *******************************************
+Grid : Message : 223.598145 s : ******* Grid Finalize                ******
+Grid : Message : 223.598146 s : *******************************************
--- a/systems/Jupiter/benchmarks/dwf.4node.perf
+++ b/systems/Jupiter/benchmarks/dwf.4node.perf
@ -0,0 +1,286 @@
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]:   totalGlobalMem: 102005473280 
+AcceleratorCudaInit[0]:   managedMemory: 1 
+AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
+AcceleratorCudaInit[0]:   warpSize: 32 
+AcceleratorCudaInit[0]:   pciBusID: 1 
+AcceleratorCudaInit[0]:   pciDeviceID: 0 
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device 
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or 
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
+AcceleratorCudaInit: Configure options --enable-setdevice=no 
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 16
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers 
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
+__|_                                    _|__
+__|_   GGGG    RRRR    III    DDDD      _|__
+__|_  G        R   R    I     D   D     _|__
+__|_  G        R   R    I     D    D    _|__
+__|_  G  GG    RRRR     I     D    D    _|__
+__|_  G   G    R  R     I     D   D     _|__
+__|_   GGGG    R   R   III    DDDD      _|__
+__|_                                    _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================ 
+Grid : Message : MPI is initialised and logging filters activated 
+Grid : Message : ================================================ 
+Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers 
+Grid : Message : MemoryManager Cache 81604378624 bytes 
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
+
+
+
+
+
+Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.838000 s :  Testing with full communication 
+Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.840000 s : Grid Layout
+Grid : Message : 0.840000 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 0.846000 s : 	OpenMP threads       : 4
+Grid : Message : 0.846000 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 0.165970 s : Initialising 4d RNG
+Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 0.960410 s : Initialising 5d RNG
+Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+local rank 2 device 0 bus id: 0029:01:00.0
+local rank 3 device 0 bus id: 0039:01:00.0
+local rank 1 device 0 bus id: 0019:01:00.0
+Grid : Message : 44.657270 s : Drawing gauge field
+Grid : Message : 55.247733 s : Random gauge initialised 
+Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 55.253053 s : Setting up Cshift based reference 
+Grid : Message : 62.191747 s : *****************************************************************
+Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 62.191768 s : *****************************************************************
+Grid : Message : 62.191769 s : *****************************************************************
+Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 62.191769 s : * Vectorising space-time by 8
+Grid : Message : 62.191770 s : * VComplex size is 64 B
+Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
+Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 62.191772 s : *****************************************************************
+Grid : Message : 62.857568 s : Called warmup
+Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
+Grid : Message : 65.582120 s : mflop/s =   48306525
+Grid : Message : 65.582140 s : mflop/s per rank =  3019157.81
+Grid : Message : 65.582150 s : mflop/s per node =  12076631.3
+Grid : Message : 65.637550 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 75.122153 s : ----------------------------------------------------------------
+Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 75.122167 s : ----------------------------------------------------------------
+Grid : Message : 75.122167 s : Called DwDag
+Grid : Message : 75.122167 s : norm dag result 4.12801829
+Grid : Message : 75.123295 s : norm dag ref    4.12801829
+Grid : Message : 75.125890 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 75.605683 s : src_e0.500004005
+Grid : Message : 75.617824 s : src_o0.499996067
+Grid : Message : 75.620089 s : *********************************************************
+Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 75.620093 s : * Vectorising space-time by 8
+Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
+Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 75.620096 s : *********************************************************
+Grid : Message : 76.732272 s : Deo mflop/s =   48068252.4
+Grid : Message : 76.732283 s : Deo mflop/s per rank   3004265.77
+Grid : Message : 76.732285 s : Deo mflop/s per node   12017063.1
+Grid : Message : 76.749317 s : r_e2.06443136
+Grid : Message : 76.749652 s : r_o2.06378451
+Grid : Message : 76.749955 s : res4.12821587
+Grid : Message : 77.198827 s : norm diff   0
+Grid : Message : 77.981760 s : norm diff even  0
+Grid : Message : 78.455900 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 78.539337 s :  Testing without internode communication 
+Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 78.539339 s : Grid Layout
+Grid : Message : 78.539339 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 78.539347 s : 	OpenMP threads       : 4
+Grid : Message : 78.539348 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 78.798501 s : Initialising 4d RNG
+Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 78.879916 s : Initialising 5d RNG
+Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 124.586264 s : Drawing gauge field
+Grid : Message : 135.338090 s : Random gauge initialised 
+Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 135.341266 s : Setting up Cshift based reference 
+Grid : Message : 142.604280 s : *****************************************************************
+Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 142.604460 s : *****************************************************************
+Grid : Message : 142.604470 s : *****************************************************************
+Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 142.604480 s : * Vectorising space-time by 8
+Grid : Message : 142.604500 s : * VComplex size is 64 B
+Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
+Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 142.604520 s : *****************************************************************
+Grid : Message : 142.686034 s : Called warmup
+Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
+Grid : Message : 144.868559 s : mflop/s =   48706194.1
+Grid : Message : 144.868561 s : mflop/s per rank =  3044137.13
+Grid : Message : 144.868562 s : mflop/s per node =  12176548.5
+Grid : Message : 144.887595 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 153.622978 s : ----------------------------------------------------------------
+Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 153.622995 s : ----------------------------------------------------------------
+Grid : Message : 153.622995 s : Called DwDag
+Grid : Message : 153.622996 s : norm dag result 4.12801829
+Grid : Message : 153.623604 s : norm dag ref    4.12801829
+Grid : Message : 153.626098 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 154.148319 s : src_e0.500004005
+Grid : Message : 154.151454 s : src_o0.499996067
+Grid : Message : 154.153722 s : *********************************************************
+Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 154.153725 s : * Vectorising space-time by 8
+Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
+Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 154.153728 s : *********************************************************
+Grid : Message : 155.200671 s : Deo mflop/s =   51121022.4
+Grid : Message : 155.200682 s : Deo mflop/s per rank   3195063.9
+Grid : Message : 155.200684 s : Deo mflop/s per node   12780255.6
+Grid : Message : 155.217204 s : r_e2.06443136
+Grid : Message : 155.217550 s : r_o2.06378451
+Grid : Message : 155.217869 s : res4.12821587
+Grid : Message : 155.673744 s : norm diff   0
+Grid : Message : 156.463329 s : norm diff even  0
+Grid : Message : 156.878866 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 157.620764 s :  Testing without intranode communication 
+Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 157.620766 s : Grid Layout
+Grid : Message : 157.620766 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 157.620773 s : 	OpenMP threads       : 4
+Grid : Message : 157.620774 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 157.671479 s : Initialising 4d RNG
+Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 157.755651 s : Initialising 5d RNG
+Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 202.465158 s : Drawing gauge field
+Grid : Message : 213.214546 s : Random gauge initialised 
+Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 213.217711 s : Setting up Cshift based reference 
+Grid : Message : 219.662772 s : *****************************************************************
+Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 219.662787 s : *****************************************************************
+Grid : Message : 219.662788 s : *****************************************************************
+Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 219.662789 s : * Vectorising space-time by 8
+Grid : Message : 219.662790 s : * VComplex size is 64 B
+Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
+Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 219.662791 s : *****************************************************************
+Grid : Message : 220.425592 s : Called warmup
+Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
+Grid : Message : 222.536267 s : mflop/s =   50365105.5
+Grid : Message : 222.536269 s : mflop/s per rank =  3147819.09
+Grid : Message : 222.536270 s : mflop/s per node =  12591276.4
+Grid : Message : 222.541053 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 232.135901 s : ----------------------------------------------------------------
+Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 232.135916 s : ----------------------------------------------------------------
+Grid : Message : 232.135917 s : Called DwDag
+Grid : Message : 232.135918 s : norm dag result 4.12801829
+Grid : Message : 232.151938 s : norm dag ref    4.12801829
+Grid : Message : 232.154451 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 232.630529 s : src_e0.500004005
+Grid : Message : 232.643197 s : src_o0.499996067
+Grid : Message : 232.645527 s : *********************************************************
+Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 232.645532 s : * Vectorising space-time by 8
+Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
+Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 232.645535 s : *********************************************************
+Grid : Message : 233.774184 s : Deo mflop/s =   47432091.9
+Grid : Message : 233.774194 s : Deo mflop/s per rank   2964505.74
+Grid : Message : 233.774196 s : Deo mflop/s per node   11858023
+Grid : Message : 233.791552 s : r_e2.06443136
+Grid : Message : 233.791899 s : r_o2.06378451
+Grid : Message : 233.792204 s : res4.12821587
+Grid : Message : 234.230783 s : norm diff   0
+Grid : Message : 235.162780 s : norm diff even  0
+Grid : Message : 235.291950 s : norm diff odd   0
+Grid : Message : 235.765411 s : *******************************************
+Grid : Message : 235.765424 s : ******* Grid Finalize                ******
+Grid : Message : 235.765425 s : *******************************************
+
--- a/systems/Jupiter/benchmarks/dwf1.slurm
+++ b/systems/Jupiter/benchmarks/dwf1.slurm
@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH --account=jureap14
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=64
+#SBATCH --time=2:00:00
+#SBATCH --partition=booster
+#SBATCH --gres=gpu:4
+
+# Single-node (4 ranks, 4 GPUs) DWF and comms benchmark job.
+# Interpreter is bash rather than sh because the script relies on the
+# bash-only `source` builtin below.
+
+# OpenMP threading and OpenMPI/UCX transport settings for the ranks.
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+OPT="--comms-overlap"
+
+source ../sourceme.sh
+
+# Generate the per-rank wrapper. Every `\$` is escaped so that expansion
+# happens when bind_gpu runs on the rank, not when this job script writes it.
+# The wrapper pins each local rank to the GPU, NUMA domain and NIC with the
+# same index, then execs the real command; "\$@" forwards the arguments
+# without re-splitting them.
+cat << EOF > bind_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3)
+export NUMA_MAP=(0 1 2 3)
+export NIC_MAP=(0 1 2 3)
+export GPU=\$SLURM_LOCALID
+export NUMA=\$SLURM_LOCALID
+export NIC=\$SLURM_LOCALID
+export CUDA_VISIBLE_DEVICES=\$GPU
+export UCX_NET_DEVICES=mlx5_\${NIC}:1
+
+echo RANK \$SLURM_LOCALID using NUMA \$NUMA  GPU \$GPU NIC \$UCX_NET_DEVICES
+exec numactl -m \$NUMA -N \$NUMA "\$@"
+EOF
+
+chmod +x ./bind_gpu
+
+# Domain wall fermion benchmark; output is captured in dwf.1node.perf.
+srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_dwf_fp32 \
+	$OPT \
+	--mpi 1.1.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > dwf.1node.perf
+
+# Host/device communications benchmark; output is captured in comms.1node.perf.
+srun --cpu-bind=no -N 1  -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_comms_host_device \
+	--mpi 1.1.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > comms.1node.perf
+
+
+
+
--- a/systems/Jupiter/benchmarks/dwf4.slurm
+++ b/systems/Jupiter/benchmarks/dwf4.slurm
@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH --account=jureap14
+#SBATCH --nodes=4
+#SBATCH --ntasks=16
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=64
+#SBATCH --time=2:00:00
+#SBATCH --partition=booster
+#SBATCH --gres=gpu:4
+
+# Four-node (16 ranks, 4 GPUs per node) DWF and comms benchmark job.
+# Interpreter is bash rather than sh because the script relies on the
+# bash-only `source` builtin below.
+
+# OpenMP threading and OpenMPI/UCX transport settings for the ranks.
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+OPT="--comms-overlap"
+
+source ../sourceme.sh
+
+# Generate the per-rank wrapper. Every `\$` is escaped so that expansion
+# happens when bind_gpu runs on the rank, not when this job script writes it.
+# The wrapper pins each local rank to the GPU, NUMA domain and NIC with the
+# same index, then execs the real command; "\$@" forwards the arguments
+# without re-splitting them.
+cat << EOF > bind_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3)
+export NUMA_MAP=(0 1 2 3)
+export NIC_MAP=(0 1 2 3)
+export GPU=\$SLURM_LOCALID
+export NUMA=\$SLURM_LOCALID
+export NIC=\$SLURM_LOCALID
+export CUDA_VISIBLE_DEVICES=\$GPU
+export UCX_NET_DEVICES=mlx5_\${NIC}:1
+
+echo RANK \$SLURM_LOCALID using NUMA \$NUMA  GPU \$GPU NIC \$UCX_NET_DEVICES
+exec numactl -m \$NUMA -N \$NUMA "\$@"
+EOF
+
+chmod +x ./bind_gpu
+
+# Domain wall fermion benchmark; output is captured in dwf.4node.perf.
+srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_dwf_fp32 \
+	$OPT \
+	--mpi 2.2.2.2 \
+	--accelerator-threads 8 \
+	--grid 64.64.64.64 \
+	--shm 2048 > dwf.4node.perf
+
+# Host/device communications benchmark; output is captured in comms.4node.perf.
+srun --cpu-bind=no -N 4  -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_comms_host_device \
+	--mpi 2.2.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > comms.4node.perf
+
+
+
+
--- a/systems/Jupiter/config-command
+++ b/systems/Jupiter/config-command
@ -0,0 +1,16 @@
+export CXX=nvcc
+export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
+export LDFLAGS="-cudart shared -L${OPENMPI}/lib" 
+export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
+
+../../configure \
+    --enable-comms=mpi \
+    --enable-simd=GPU \
+    --enable-gen-simd-width=64 \
+    --enable-shm=nvlink \
+    --enable-accelerator=cuda \
+    --with-lime=$CLIME \
+    --disable-gparity \
+    --disable-fermion-reps \
+    --disable-unified 
+
--- a/systems/Jupiter/sourceme.sh
+++ b/systems/Jupiter/sourceme.sh
@ -0,0 +1,9 @@
+CLIME=$HOME/install/
+module load Clang
+module load CUDA
+module load FFTW
+module load OpenSSL
+module load MPFR
+module load NVHPC
+module load UCX
+module load OpenMPI
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@ -1,3 +1,12 @@
+
+spack load c-lime
+spack load fftw
+spack load hdf5+cxx
+
+export FFTW=`spack find --paths fftw       | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5+cxx   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime    | grep ^c-lime | awk '{print $2}' `
+
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
@ -5,12 +14,16 @@
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
 --enable-simd=AVX512 \
--disable-accelerator-cshift \
+--with-lime=$CLIME \
+--with-hdf5=$HDF5 \
+--with-fftw=$FFTW \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=clang++ \
 MPICXX=mpicxx \
-CXXFLAGS="-std=c++17"
+LIBS=-llime \
+LDFLAGS=-L$CLIME/lib/ \
+CXXFLAGS="-std=c++17 -fPIE"



--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@ -1,4 +1,5 @@
 source $HOME/spack/share/spack/setup-env.sh
 spack load llvm@17.0.4
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
-module load openmpi
+module load openmpi/4.1.8
+spack load c-lime
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@ -31,12 +31,23 @@ directory

 using namespace std;
 using namespace Grid;
- ;

 //typedef WilsonFermionD FermionOp;
 typedef DomainWallFermionD FermionOp;
 typedef typename DomainWallFermionD::FermionField FermionField;

+// Write the lattice field `in` to the file `fname` as a single SciDAC field
+// record (empty user record, final argument 0 as in the referenced test).
+// The writer is constructed with the boss-rank flag taken from the field's grid.
+// When Grid is built without LIME (HAVE_LIME undefined) nothing can be
+// written; that is reported in the log instead of being skipped silently.
+template <class T> void writeFile(T& in, const std::string& fname){
+#ifdef HAVE_LIME
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record;
+  Grid::ScidacWriter WR(in.Grid()->IsBoss());
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0);
+  WR.close();
+#else
+  // No LIME support compiled in: make the missing output visible to the user.
+  std::cout << Grid::GridLogMessage << "writeFile: built without LIME, NOT writing " << fname << std::endl;
+#endif
+}

 RealD AllZero(RealD x) { return 0.; }

@ -121,7 +132,7 @@ int main(int argc, char** argv) {

  int Ls=16;
  RealD M5=1.8;
-  RealD mass = -1.0;
+  RealD mass = 0.01;

  mass=LanParams.mass;
  Ls=LanParams.Ls;
@ -159,10 +170,10 @@ int main(int argc, char** argv) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
 */
-
-  int Nstop = 10;
  int Nk = 20;
+  int Nstop = Nk;
  int Np = 80;
+
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
@ -201,8 +212,12 @@ int main(int argc, char** argv) {
  int Nconv;
  IRL.calc(eval, evec, src, Nconv);

-  std::cout << mass <<" : " << eval << std::endl;
-
+  std::cout << mass <<" : " << eval        << std::endl;
+  std::cout << " #evecs "   << evec.size() << std::endl;
+  std::cout << " Nconv  "   << Nconv       << std::endl;
+  std::cout << " Nm     "   << Nm          << std::endl;
+  if ( Nconv > evec.size() ) Nconv = evec.size();
+  
 #if 0
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
@ -232,6 +247,7 @@ int main(int argc, char** argv) {
  vector<LatticeFermion> finalevec(Nconv, FGrid);
  vector<RealD> eMe(Nconv), eMMe(Nconv);
  for(int i = 0; i < Nconv; i++){
+    cout << "calculate the matrix element["<<i<<"]" << endl;
    G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
@ -298,7 +314,7 @@ int main(int argc, char** argv) {
      }
    }
  }
-    for(int i = 0; i < Nconv; i++){
+  for(int i = 0; i < Nconv; i++){
    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
@ -306,6 +322,7 @@ int main(int argc, char** argv) {
  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
  cout << eMMe << endl;

+  

 //  vector<LatticeFermion> finalevec(Nconv, FGrid);
 // temporary, until doing rotation
@ -326,13 +343,41 @@ int main(int argc, char** argv) {
      axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
    }
  }
+  
+  for(int i = 0; i < Nconv; i++){
+    Ddwf.M(finalevec[i], G5R5Mevec[i]);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
+  for(int i = 0; i < Nconv; i++){
+    RealD t1,t2;
+    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
+  
  for(int i = 0; i < Nconv; i++){
    chiral_matrix_real[i].resize(Nconv);
    chiral_matrix[i].resize(Nconv);
+
+    std::string evfile("./evec_density");
+    evfile = evfile+"_"+std::to_string(i);
+    auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
+    writeFile(evdensity,evfile);
+
    for(int j = 0; j < Nconv; j++){
      chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
+      std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
      chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
      std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
+      if ( chiral_matrix_real[i][j] > 0.8 ) {
+	auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
+	std::string chfile("./chiral_density_");
+	chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
+	writeFile(g5density,chfile);
+      }
    }
  }
  for(int i = 0; i < Nconv; i++){
@ -341,6 +386,43 @@ int main(int argc, char** argv) {
    }
  }

-
+  FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
+#define PYTHON_LINE(A)  fprintf(fp,A"\n");
+  PYTHON_LINE("import matplotlib.pyplot as plt");
+  PYTHON_LINE("import numpy as np");
+  PYTHON_LINE("");
+  PYTHON_LINE("fig = plt.figure()");
+  PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
+  PYTHON_LINE("");
+  PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
+  fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
+  PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
+  PYTHON_LINE("xpos = xpos.ravel()");
+  PYTHON_LINE("ypos = ypos.ravel()");
+  PYTHON_LINE("zpos = 0");
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
+  PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
+  PYTHON_LINE("dz = np.array([");
+  for(int i = 0; i < Nconv; i++){
+    fprintf(fp,"\t[ ");
+    for(int j = 0; j < Nconv; j++){
+      fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
+      if(j<Nconv-1) fprintf(fp,",");
+      else          fprintf(fp," ");
+    }
+    fprintf(fp,"]");
+    if(i<Nconv-1) fprintf(fp,",\n");
+    else          fprintf(fp,"\n");
+  }
+	      
+  PYTHON_LINE("\t])");
+  PYTHON_LINE("dz = dz.ravel()");
+  PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
+  PYTHON_LINE("plt.show()");
+  fclose(fp);
+  
  Grid_finalize();
 }
--- a/visualisation/CMakeLists.txt
+++ b/visualisation/CMakeLists.txt
@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

 project(GridViewer)

-list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
+list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/")

 find_package(VTK COMPONENTS 
  CommonColor
--- a/visualisation/README
+++ b/visualisation/README
@ -73,6 +73,21 @@ each to:

   VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.

+========================================
+Aurora compilation:
+========================================
+module load ffmpeg
+download & untar: VTK-7.0.2
+mkdir build
+cd build 
+ccmake ../
+
+"t"
+Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG   YES   
+"configure" ; should "discover" the installed ffmpeg module
+
+Still need an "X" connection to make the MPEG files.
+

 ========================================
 Grid:
--- a/visualisation/cmake-command
+++ b/visualisation/cmake-command
@ -1,10 +1,17 @@
-libs=`grid-config --libs`
-ldflags=`grid-config --ldflags`
-cxxflags=`grid-config --cxxflags`
-cxx=`grid-config --cxx`
-cc=clang
+export grid_config=/home/paboyle/GPT/install/bin/grid-config
+libs=`$grid_config --libs`
+ldflags=`$grid_config --ldflags`
+cxxflags=`$grid_config --cxxflags`
+cxx=`$grid_config --cxx`
+cc=icx

 mkdir build
 cd build

-LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags 
+echo CC $cc
+echo CXX $cxx
+echo CXXFLAGS $cxxflags
+echo LDFLAGS  $ldflags
+echo LIBS  $libs
+
+LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "
Author	SHA1	Message	Date
Peter Boyle	9203126aa5	Scripts	2025-06-11 15:30:16 +02:00
Peter Boyle	f90ba4712a	Update for Jupiter	2025-06-11 15:24:34 +02:00
Peter Boyle	3737a24096	Updated python output	2025-06-03 14:09:29 -04:00
Peter Boyle	d418f78352	Making running on Aurora more debuggable	2025-05-23 20:58:16 +00:00
Peter Boyle	25163998a0	Makes SYCL compiler happy	2025-05-23 20:57:11 +00:00
Peter Boyle	dc546aaa4b	Updated config options for BNL cluster	2025-05-13 18:44:47 -04:00
Peter Boyle	5364d580c9	Output chirality, eigenvector density files and python source lego plot	2025-05-13 18:44:47 -04:00
Peter Boyle	2a9a6347e3	Do not require Grid format RNGs and also to the 5Li reporting	2025-05-13 18:44:47 -04:00
Peter Boyle	cfdb56f314	Run measurements at t=0 too	2025-05-13 18:44:46 -04:00
Peter Boyle	b517e88db3	Update README	2025-05-13 16:49:21 -04:00
Peter Boyle	bb317aba8d	Lattice = for sycl	2025-05-13 12:50:58 +00:00
Peter Boyle	644cc6647e	JSON update	2025-05-13 12:50:58 +00:00
Peter Boyle	72397ce23b	SYCL interface change	2025-05-13 12:50:58 +00:00