mirror of https://github.com/paboyle/Grid.git

Hadrons: meson field threaded cache copy

Antonin Portelli 2018-08-14 14:02:37 +01:00
parent 89d2fac92e
commit f4878d3a13
5 changed files with 21 additions and 18 deletions

View File

@@ -296,7 +296,8 @@ void TA2AMesonField<FImpl>::execute(void)
                   + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
         startTimer("cache copy");
-        for(int iii=0;iii< N_iii;iii++)
+        parallel_for_nest(5)(int iii=0;iii< N_iii;iii++)
         for(int jjj=0;jjj< N_jjj;jjj++)
         for(int m =0;m< nmom;m++)
         for(int g =0;g< ngamma;g++)
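
Note on the change above: `parallel_for_nest(5)` is meant to expand to an OpenMP `collapse(5)` pragma followed by `for`, flattening the five perfectly nested loops of the cache copy into a single parallel iteration space (OpenMP requires the collapsed loops to be perfectly nested, which holds here since only the innermost loop has a body). A minimal, self-contained sketch of the same technique, with hypothetical sizes and a hypothetical `cache` array, not taken from the diff:

    // Sketch only: bounds and the 'cache' array are illustrative.
    #include <complex>
    #include <vector>

    int main(void)
    {
        const int N_iii = 4, N_jjj = 4, nmom = 2, ngamma = 16, nt = 8;
        std::vector<std::complex<double>> cache(N_iii*N_jjj*nmom*ngamma*nt);

        // Flatten all five loops into one parallel iteration space,
        // as parallel_for_nest(5) is intended to do for the cache copy.
        #pragma omp parallel for collapse(5)
        for(int iii=0;iii<N_iii;iii++)
        for(int jjj=0;jjj<N_jjj;jjj++)
        for(int m=0;m<nmom;m++)
        for(int g=0;g<ngamma;g++)
        for(int t=0;t<nt;t++)
        {
            cache[(((iii*N_jjj+jjj)*nmom+m)*ngamma+g)*nt+t] = 0.;
        }
        return 0;
    }

Compiled with -fopenmp the pragma is honoured; without it the loops simply run serially.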
@@ -310,30 +311,30 @@ void TA2AMesonField<FImpl>::execute(void)
     // IO
     if (!par().output.empty())
     {
-        double blockSize, ioTime;
+        double blockSize, ioTime;
         LOG(Message) << "Writing block to disk" << std::endl;
         ioTime = -getDTimer("IO: write block");
         startTimer("IO: total");
-        for(int m = 0; m < nmom; m++)
-        for(int g = 0; g < ngamma; g++)
-        {
-            if ((i == 0) and (j == 0))
+        for(int m = 0; m < nmom; m++)
+        for(int g = 0; g < ngamma; g++)
+        {
+            if ((i == 0) and (j == 0))
             {
                 startTimer("IO: file creation");
-                initFile(m, g);
+                initFile(m, g);
                 stopTimer("IO: file creation");
             }
-            }
+            }
             startTimer("IO: write block");
-            saveBlock(mfBlock, m, g, i, j);
+            saveBlock(mfBlock, m, g, i, j);
             stopTimer("IO: write block");
-        }
+        }
         stopTimer("IO: total");
         blockSize = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(Complex));
         ioTime += getDTimer("IO: write block");
         LOG(Message) << "HDF5 IO done " << blockSize/ioTime*1.0e6/1024/1024
-                     << " MB/s" << std::endl;
-    }
+                     << " MB/s" << std::endl;
+    }
     }
     double nodes = env().getGrid()->NodeCount();
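
Aside on the throughput message above: `getDTimer` presumably reports elapsed time in microseconds (an assumption; its definition is not shown in this diff), which is what makes the unit conversion in the `LOG` line work out as bytes/us -> bytes/s -> MB/s:

    // Hedged sketch of the bandwidth formula, assuming ioTime is in microseconds.
    double blockSize = 512.0*1024*1024;  // bytes written, example value
    double ioTime    = 2.0e6;            // elapsed write time in us, example value
    double rate      = blockSize/ioTime*1.0e6/1024/1024;
    // 512 MB in 2 s -> rate == 256 MB/s, matching the "HDF5 IO done ... MB/s" log line.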

View File

@@ -174,7 +174,7 @@ void makeMesonFieldBlock(MesonField &mat,
    if (caller) caller->startTimer("contraction: spin trace");
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
-   parallel_for_nest2(int lt=0;lt<ld;lt++)
+   parallel_for_nest(2)(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {

View File

@@ -94,7 +94,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
    int n1=rhs._grid->_slice_stride[dimension];
    if ( cbmask ==0x3){
-       parallel_for_nest2(int n=0;n<e1;n++){
+       parallel_for_nest(2)(int n=0;n<e1;n++){
            for(int b=0;b<e2;b++){
                int o = n*n1;
@@ -110,7 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
    // Case of SIMD split AND checker dim cannot currently be hit, except in
    // Test_cshift_red_black code.
    std::cout << " Dense packed buffer WARNING " <<std::endl;
-   parallel_for_nest2(int n=0;n<e1;n++){
+   parallel_for_nest(2)(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
            int o=n*n1;
@@ -191,7 +191,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
    int e2=rhs._grid->_slice_block[dimension];
    if(cbmask ==0x3 ) {
-       parallel_for_nest2(int n=0;n<e1;n++){
+       parallel_for_nest(2)(int n=0;n<e1;n++){
            for(int b=0;b<e2;b++){
                int o = n*rhs._grid->_slice_stride[dimension];
                int offset = b+n*rhs._grid->_slice_block[dimension];

View File

@@ -522,7 +522,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    tensor_reduced at; at=av;
-   parallel_for_nest2(int n=0;n<e1;n++){
+   parallel_for_nest(2)(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
            int ss= so+n*stride+b;
            R._odata[ss] = at*X._odata[ss]+Y._odata[ss];

View File

@@ -41,12 +41,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
 #define PARALLEL_NESTED_LOOP2    _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_NESTED_LOOP(n)  _Pragma("omp parallel for collapse(n)")
 #define PARALLEL_REGION          _Pragma("omp parallel")
 #define PARALLEL_CRITICAL        _Pragma("omp critical")
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_NESTED_LOOP2
+#define PARALLEL_NESTED_LOOP(n)
 #define PARALLEL_REGION
 #define PARALLEL_CRITICAL
 #endif
@@ -54,7 +56,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_region        PARALLEL_REGION
 #define parallel_for           PARALLEL_FOR_LOOP for
 #define parallel_for_internal  PARALLEL_FOR_LOOP_INTERN for
-#define parallel_for_nest2     PARALLEL_NESTED_LOOP2 for
+#define parallel_for_nest(n)   PARALLEL_NESTED_LOOP(n) for
 namespace Grid {
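
One portability caveat on the new macro: the C/C++ preprocessor never substitutes macro parameters inside string literals, so `_Pragma("omp parallel for collapse(n)")` emits the literal clause `collapse(n)` regardless of the argument passed to `PARALLEL_NESTED_LOOP`. If a compiler rejects that, the usual workaround is to build the pragma string by stringizing the argument; a sketch (the helper macro name is illustrative, not from this commit):

    // Stringize the pragma text so the parameter n is actually substituted.
    #define GRID_PRAGMA_STR(x)      #x
    #define PARALLEL_NESTED_LOOP(n) _Pragma(GRID_PRAGMA_STR(omp parallel for collapse(n)))
    // PARALLEL_NESTED_LOOP(5) now expands to _Pragma("omp parallel for collapse(5)").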