From 95af55128e1c85bdc3028b8012c5f169fa2ef379 Mon Sep 17 00:00:00 2001
From: James Harrison <jch1g10@soton.ac.uk>
Date: Fri, 3 Nov 2017 18:46:16 +0000
Subject: [PATCH] =?UTF-8?q?QedFVol:=20Redo=20optimisation=20of=20scalar=20?=
 =?UTF-8?q?VP=20(extra=20memory=20requirements=20were=20not=20the=20proble?=
 =?UTF-8?q?m),=20and=20undo=20optimisation=20of=20charged=20propagator=20(?=
 =?UTF-8?q?which=20seemed=20to=20be=20causing=20HDF5=20errors,=20although?=
 =?UTF-8?q?=20I=20don=E2=80=99t=20know=20why).?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 extras/Hadrons/Modules/MScalar/ChargedProp.cc | 131 +++++++++++-------
 extras/Hadrons/Modules/MScalar/ScalarVP.cc    | 131 ++++--------------
 2 files changed, 110 insertions(+), 152 deletions(-)
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
index cb7e6c79..1b901bf1 100644
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -151,25 +151,23 @@ void TChargedProp::execute(void)
     buf = GFSrc;
     momD1(buf, fft);
     buf = -G*buf;
-    fft.FFT_dim(propQ, buf, env().getNd()-1, FFT::backward);
+    fft.FFT_all_dim(propQ, buf, FFT::backward);
 
     // G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
     buf = -buf;
     momD1(buf, fft);
     propSun = G*buf;
-    fft.FFT_dim(propSun, propSun, env().getNd()-1, FFT::backward);
+    fft.FFT_all_dim(propSun, propSun, FFT::backward);
 
     // -G*momD2*G*F*Src (momD2 = F*D2*Finv)
     buf = GFSrc;
     momD2(buf, fft);
     buf = -G*buf;
-    fft.FFT_dim(propTad, buf, env().getNd()-1, FFT::backward);
-    
-    // full charged scalar propagator
-    buf = GFSrc;
-    fft.FFT_dim(buf, buf, env().getNd()-1, FFT::backward);
-    prop = buf + q*propQ + q*q*propSun + q*q*propTad;
+    fft.FFT_all_dim(propTad, buf, FFT::backward);
 
+    // full charged scalar propagator
+    prop = (*prop0_) + q*propQ + q*q*propSun + q*q*propTad;
+    
     // OUTPUT IF NECESSARY
     if (!par().output.empty())
     {
@@ -186,59 +184,94 @@ void TChargedProp::execute(void)
                      << filename << "'..." << std::endl;
 
             CorrWriter            writer(filename);
-            // std::vector<TComplex> vecBuf;
-            std::vector<Complex>  result, result0, resultQ, resultSun, resultTad;
-            result.resize(env().getGrid()->_fdimensions[env().getNd()-1]);
-            result0.resize(env().getGrid()->_fdimensions[env().getNd()-1]);
-            resultQ.resize(env().getGrid()->_fdimensions[env().getNd()-1]);
-            resultSun.resize(env().getGrid()->_fdimensions[env().getNd()-1]);
-            resultTad.resize(env().getGrid()->_fdimensions[env().getNd()-1]);
+            std::vector<TComplex> vecBuf;
+            std::vector<Complex>  result;
 
             write(writer, "charge", q);
             write(writer, "mass", par().mass);
 
-            TComplex site;
-            std::vector<int>   whichmom;
-            whichmom.resize(env().getNd());
-
+            // Write full propagator
+            buf = prop;
             for (unsigned int j = 0; j < env().getNd()-1; ++j)
             {
-                whichmom[j] = mom[j];
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
             }
-
-            for (unsigned int t = 0; t < env().getGrid()->_fdimensions[env().getNd()-1]; ++t)
+            sliceSum(buf, vecBuf, Tp);
+            result.resize(vecBuf.size());
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
             {
-                whichmom[env().getNd()-1] = t;
-                // Write full propagator
-                peekSite(site, prop, whichmom);
-                result[t]=TensorRemove(site);
-                // Write free propagator
-                peekSite(site, buf, whichmom);
-                result0[t]=TensorRemove(site);
-                // Write propagator O(q) term
-                peekSite(site, propQ, whichmom);
-                resultQ[t]=TensorRemove(site);
-                // Write propagator sunset term
-                peekSite(site, propSun, whichmom);
-                resultSun[t]=TensorRemove(site);
-                // Write propagator tadpole term
-                peekSite(site, propTad, whichmom);
-                resultTad[t]=TensorRemove(site);
+                result[t] = TensorRemove(vecBuf[t]);
             }
             write(writer, "prop", result);
-            write(writer, "prop_0", result0);
-            write(writer, "prop_Q", resultQ);
-            write(writer, "prop_Sun", resultSun);
-            write(writer, "prop_Tad", resultTad);
+
+            // Write free propagator
+            buf = *prop0_;
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
+            }
+            sliceSum(buf, vecBuf, Tp);
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            {
+                result[t] = TensorRemove(vecBuf[t]);
+            }
+            write(writer, "prop_0", result);
+
+            // Write propagator O(q) term
+            buf = propQ;
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
+            }
+            sliceSum(buf, vecBuf, Tp);
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            {
+                result[t] = TensorRemove(vecBuf[t]);
+            }
+            write(writer, "prop_Q", result);
+
+            // Write propagator sunset term
+            buf = propSun;
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
+            }
+            sliceSum(buf, vecBuf, Tp);
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            {
+                result[t] = TensorRemove(vecBuf[t]);
+            }
+            write(writer, "prop_Sun", result);
+
+            // Write propagator tadpole term
+            buf = propTad;
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
+            }
+            sliceSum(buf, vecBuf, Tp);
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            {
+                result[t] = TensorRemove(vecBuf[t]);
+            }
+            write(writer, "prop_Tad", result);
         }
     }
-
-    std::vector<int> mask(env().getNd(),1);
-    mask[env().getNd()-1] = 0;
-    fft.FFT_dim_mask(prop, prop, mask, FFT::backward);
-    fft.FFT_dim_mask(propQ, propQ, mask, FFT::backward);
-    fft.FFT_dim_mask(propSun, propSun, mask, FFT::backward);
-    fft.FFT_dim_mask(propTad, propTad, mask, FFT::backward);
 }
 
 void TChargedProp::momD1(ScalarField &s, FFT &fft)
diff --git a/extras/Hadrons/Modules/MScalar/ScalarVP.cc b/extras/Hadrons/Modules/MScalar/ScalarVP.cc
index 4d923802..297a823d 100644
--- a/extras/Hadrons/Modules/MScalar/ScalarVP.cc
+++ b/extras/Hadrons/Modules/MScalar/ScalarVP.cc
@@ -144,13 +144,19 @@ void TScalarVP::execute(void)
     }
 
     // Open output files if necessary
+    std::vector<TComplex>   vecBuf;
+    std::vector<Complex>    result;
+    ScalarField vpPhase(env().getGrid());
     std::vector<CorrWriter *> writer, writer0, writerD;
+    std::vector<ScalarField> momphases;
     if (!par().output.empty())
     {
+        LOG(Message) << "Preparing output files..." << std::endl;
         for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
         {
             std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
 
+            // Open output files
             std::string           filename = par().output + "_" + std::to_string(mom[0])
                                                           + std::to_string(mom[1])
                                                           + std::to_string(mom[2])
@@ -180,11 +186,20 @@ void TScalarVP::execute(void)
             write(*writer0[i_p], "mass", static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
             write(*writerD[i_p], "charge", q);
             write(*writerD[i_p], "mass", static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
+
+            // Calculate phase factors
+            vpPhase = Complex(1.0,0.0);
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    vpPhase = vpPhase*(*phase_[j]);
+                }
+            }
+            vpPhase = adj(vpPhase);
+            momphases.push_back(vpPhase);
         }
     }
-    std::vector<TComplex>   vecBuf;
-    std::vector<Complex>    result;
-    ScalarField vpPhase(env().getGrid());
 
     // Do contractions
     for (unsigned int nu = 0; nu < env().getNd(); ++nu)
@@ -207,18 +222,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = freeVpTensor[mu][nu];
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = freeVpTensor[mu][nu]*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -244,18 +250,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -281,18 +278,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -316,18 +304,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -350,18 +329,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -385,18 +355,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -419,18 +380,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -457,18 +409,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -494,18 +437,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = tmp_vp;
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = tmp_vp*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)
@@ -521,18 +455,9 @@ void TScalarVP::execute(void)
             // Output if necessary
             if (!par().output.empty())
             {
-                std::vector<int> mom;
                 for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                 {
-                    mom = strToVec<int>(par().outputMom[i_p]);
-                    vpPhase = vpTensor[mu][nu];
-                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
-                    {
-                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
-                        {
-                            vpPhase = vpPhase*adj(*phase_[j]);
-                        }
-                    }
+                    vpPhase = vpTensor[mu][nu]*momphases[i_p];
                     sliceSum(vpPhase, vecBuf, Tp);
                     result.resize(vecBuf.size());
                     for (unsigned int t = 0; t < vecBuf.size(); ++t)