merged in develop

2025-10-26 17:49:33 +00:00 · 2019-12-04 17:12:46 +00:00
parent e940f4db7e d6100cc35a
commit cd9fd80a5d
39 changed files with 2641 additions and 68 deletions
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -162,11 +162,8 @@ static inline int divides(int a,int b)
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
+  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);

@@ -177,7 +174,8 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
  while(AutoShmSize != WorldShmSize) {
    for(int p=0;p<primes.size();p++) {
      int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim]) ) {
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize)  ) {
 	AutoShmSize*=prime;
 	ShmDims[dim]*=prime;
 	break;
@@ -308,7 +306,6 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
-
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
@@ -435,10 +432,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // e.g. DGX1, supermicro board, 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
+
 #ifdef GRID_IBM_SUMMIT
-  std::cout << header << "flag IBM_SUMMIT disabled CUDA set device: ensure jsrun is used correctly" <<std::endl;
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
 #else
-  cudaSetDevice(WorldShmRank);
+    std::cout << "setting device to WorldShmRank"<<std::endl;
+    cudaSetDevice(WorldShmRank);
 #endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
@@ -466,7 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
@@ -735,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 

+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets 
+  // if even number of nodes
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize; 
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+	int hisSocket=hisRank/SocketSize;
+	if ( hisSocket != mySocket ) {
+	  ShmRanks[r] = MPI_UNDEFINED;
+	}
+      }
+    }
+  }
+#endif
+
  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
+#ifdef GRID_NVCC
+accelerator_inline uint64_t __rdtsc(void) {  return 0; }
+accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
+#else
 #include <x86intrin.h>
 #endif
+#endif

 NAMESPACE_BEGIN(Grid);

@@ -89,13 +94,8 @@ inline uint64_t cyclecount(void){
  return tmp;
 }
 #elif defined __x86_64__
-#ifdef GRID_NVCC
-accelerator_inline uint64_t __rdtsc(void) {  return 0; }
-#endif
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
-  //  unsigned int dummy;
-  // return __rdtscp(&dummy);
 }
 #else

--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 NAMESPACE_BEGIN(Grid);

--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc.master
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/ImprovedStaggeredFermionInstantiationStaggeredImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/ImprovedStaggeredFermionInstantiationStaggeredImplD.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/ImprovedStaggeredFermionInstantiationStaggeredImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/ImprovedStaggeredFermionInstantiationStaggeredImplF.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -1,5 +1,34 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/smearing/StoutSmearing.h
+ 
+ Copyright (C) 2019
+ 
+ Author: unknown
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution
+ directory
+ *************************************************************************************/
 /*
-  @file stoutSmear.hpp
+  @file StoutSmearing.h
  @brief Declares Stout smearing class
 */
 #pragma once
@@ -9,19 +38,43 @@ NAMESPACE_BEGIN(Grid);
 /*!  @brief Stout smearing of link variable. */
 template <class Gimpl>
 class Smear_Stout : public Smear<Gimpl> {
-private:
-  const Smear<Gimpl>* SmearBase;
+ private:
+  int OrthogDim = -1;
+  const std::vector<double> SmearRho;
+  // Smear<Gimpl>* ownership semantics:
+  //    Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
+  //    Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
+  const std::unique_ptr<Smear<Gimpl>> OwnedBase; // deleted at destruction
+  const Smear<Gimpl>* SmearBase; // Not owned by this object, so not deleted at destruction

+  // only anticipated to be used from default constructor
+  inline static std::vector<double> rho3D(double rho, int orthogdim){
+    std::vector<double> rho3d(Nd*Nd);
+    for (int mu=0; mu<Nd; mu++)
+      for (int nu=0; nu<Nd; nu++)
+        rho3d[mu + Nd * nu] = (mu == nu || mu == orthogdim || nu == orthogdim) ? 0.0 : rho;
+    return rho3d;
+  };
+  
 public:
  INHERIT_GIMPL_TYPES(Gimpl)

-  Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
-    assert(Nc == 3);//                  "Stout smearing currently implemented only for Nc==3");
+  /*! Stout smearing with base explicitly specified */
+  Smear_Stout(Smear<Gimpl>* base) : SmearBase{base} {
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
  }

-  /*! Default constructor */
-  Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
-    assert(Nc == 3);//                  "Stout smearing currently implemented only for Nc==3");
+  /*! Construct stout smearing object from explicitly specified rho matrix */
+  Smear_Stout(const std::vector<double>& rho_)
+    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
+    }
+
+  /*! Default constructor. rho is constant in all directions, optionally except for orthogonal dimension */
+  Smear_Stout(double rho = 1.0, int orthogdim = -1)
+  : OrthogDim{orthogdim}, SmearRho{ rho3D(rho,orthogdim) }, OwnedBase{ new Smear_APE<Gimpl>(SmearRho) }, SmearBase{OwnedBase.get()} {
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
  }

  ~Smear_Stout() {}  // delete SmearBase...
@@ -36,12 +89,16 @@ public:
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      tmp = peekLorentz(C, mu);
-      Umu = peekLorentz(U, mu);
-      iq_mu = Ta(
-		 tmp *
-		 adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
-      exponentiate_iQ(tmp, iq_mu);
+      if( mu == OrthogDim )
+        tmp = 1.0;  // Don't smear in the orthogonal direction
+      else {
+        tmp = peekLorentz(C, mu);
+        Umu = peekLorentz(U, mu);
+        iq_mu = Ta(
+                   tmp *
+                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+        exponentiate_iQ(tmp, iq_mu);
+      }
      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
@@ -80,6 +137,7 @@ public:
    iQ2 = iQ * iQ;
    iQ3 = iQ * iQ2;

+    //We should check sgn(c0) here already and then apply eq (34) from 0311018
    set_uw(u, w, iQ2, iQ3);
    set_fj(f0, f1, f2, u, w);

@@ -139,9 +197,8 @@ public:
  }

  LatticeComplex func_xi0(const LatticeComplex& w) const {
-    // Define a function to do the check
-    // if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small:
-    // "<< w <<"\n";
+    // Definition from arxiv 0311018
+    //if (abs(w) < 0.05) {w2 = w*w; return 1.0 - w2/6.0 * (1.0-w2/20.0 * (1.0-w2/42.0));}
    return sin(w) / w;
  }

@@ -154,4 +211,3 @@ public:
 };

 NAMESPACE_END(Grid);
-
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -260,7 +260,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
 	      for(int mu=0;mu<Ngamma;mu++){
 		// this is a bit slow
-		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
+		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
 	      }
 	    }
 	  }
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -45,8 +45,8 @@ public:
  typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField;
  typedef typename SpinMatrixField::vector_object sobj;

-  static constexpr int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-  static constexpr int epsilon_sgn[6]= {1,1,1,-1,-1,-1};
+  static const int epsilon[6][3] ;
+  static const Complex epsilon_sgn[6];

  private: 
  template <class mobj, class robj>
@@ -149,10 +149,15 @@ public:
 				 SpinMatrixField &stn_corr);
 };

-template <class FImpl>
-constexpr int BaryonUtils<FImpl>::epsilon[6][3];
-template <class FImpl>
-constexpr int BaryonUtils<FImpl>::epsilon_sgn[6];
+template <class FImpl> 
+const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
+template <class FImpl> 
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+						    Complex(1),
+						    Complex(1),
+						    Complex(-1),
+						    Complex(-1),
+						    Complex(-1)};

 template <class FImpl>
 template <class mobj, class robj>
@@ -189,7 +194,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
          }}}
  	}	  
        //This is the \delta_{456}^{231} part
@@ -198,7 +203,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
          }}}
        }	  
        //This is the \delta_{456}^{312} part
@@ -207,7 +212,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
          }}}
        }	  
        //This is the \delta_{456}^{132} part
@@ -216,7 +221,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
          }}}
        }	  
        //This is the \delta_{456}^{321} part
@@ -225,7 +230,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
          }}}
        }	  
        //This is the \delta_{456}^{213} part
@@ -234,7 +239,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= static_cast<Complex>(epsilon_sgn[ie_left] * epsilon_sgn[ie_right]) * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
          }}}
        }	  
      }
@@ -414,10 +419,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_xi,
        for (int tau2=0; tau2<Ns; tau2++){
        for (int j=0; j<Nc; j++){
          auto DuGH_at_aj = DuGH()(alpha_s,tau2)(a_s,j);
-          auto ee_GDGDDG_a = static_cast<Complex>(epsilon_sgn[ie_n] * epsilon_sgn[ie_s]) * GDsGDd_ab_bb * DuGH_at_aj;
+          auto ee_GDGDDG_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_at_aj;
          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
            auto DuGH_gt_cj = DuGH()(gamma_s,tau2)(c_s,j);
-            auto ee_GDGDDG_c = static_cast<Complex>(epsilon_sgn[ie_n] * epsilon_sgn[ie_s]) * GDsGDd_ab_bb * DuGH_gt_cj;
+            auto ee_GDGDDG_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_gt_cj;
            for (int gamma_n=0; gamma_n<Ns; gamma_n++){
              result()(gamma_s,gamma_n)() += ee_GDGDDG_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau2,gamma_n)(j,c_n);
              result()(gamma_s,gamma_n)() += ee_GDGDDG_c * adjDuG()(tau2, beta_n)(j,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
@@ -466,7 +471,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
 	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i);
        for (int beta_n=0; beta_n<Ns; beta_n++){
          auto DqGDd_tb_ib = DqGDd()(tau,beta_n)(i,b_n);
-	  auto ee_GDGDGD = static_cast<Complex>(epsilon_sgn[ie_n] * epsilon_sgn[ie_s]) * GDsG_at_bi * DqGDd_tb_ib;
+	  auto ee_GDGDGD = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DqGDd_tb_ib;
          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
          for (int gamma_n=0; gamma_n<Ns; gamma_n++){
            result()(gamma_s,gamma_n)() -= ee_GDGDGD * DuG()(alpha_s, beta_n)(a_s,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
@@ -515,10 +520,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_xi,
 	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i);
        for (int beta_n=0; beta_n<Ns; beta_n++){
          auto DuGDd_ab_ab = DuGDd()(alpha_s,beta_n)(a_s,b_n);
-	  auto ee_GDGDGD_a = static_cast<Complex>(epsilon_sgn[ie_n] * epsilon_sgn[ie_s]) * GDsG_at_bi * DuGDd_ab_ab;
+	  auto ee_GDGDGD_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_ab_ab;
          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
            auto DuGDd_gb_cb = DuGDd()(gamma_s,beta_n)(c_s,b_n);
-	    auto ee_GDGDGD_c = static_cast<Complex>(epsilon_sgn[ie_n] * epsilon_sgn[ie_s]) * GDsG_at_bi * DuGDd_gb_cb;
+	    auto ee_GDGDGD_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_gb_cb;
            for (int gamma_n=0; gamma_n<Ns; gamma_n++){
              result()(gamma_s,gamma_n)() -= ee_GDGDGD_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau,gamma_n)(i,c_n);
              result()(gamma_s,gamma_n)() -= ee_GDGDGD_c * adjDuG()(tau, beta_n)(i,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -1233,7 +1233,7 @@ public:
  };
  
  void Report(void) {
-#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
+#define AVERAGE(A) 
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
@@ -1281,11 +1281,13 @@ public:
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
+      /*
      PRINTIT(mpi3synctime);
      PRINTIT(mpi3synctime_g);
      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
+      */
    }
 #undef PRINTIT
 #undef AVERAGE