Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: feature/lu ... feature/mp (39 commits)
| SHA1 |
|---|
| b820076b91 |
| 09f66100d3 |
| d7d92af09d |
| 460d0753a1 |
| 8f8058f8a5 |
| d97a27f483 |
| 7c3363b91e |
| b94478fa51 |
| 13bf0482e3 |
| a795b5705e |
| 392e064513 |
| b6a65059a2 |
| ea25a4d9ac |
| c190221fd3 |
| 0fcd2e7188 |
| 910b8dd6a1 |
| 75ebd3a0d1 |
| 09fd5c43a7 |
| f22317748f |
| 6a9eae6b6b |
| fad96cf250 |
| f331809c27 |
| 2c54a53d0a |
| 306160ad9a |
| 20a091c3ed |
| 202078eb1b |
| a762b1fb71 |
| 5b5925b8e5 |
| b58adc6a4b |
| f9d5e95d72 |
| 4f8e636a43 |
| 9b39f35ae6 |
| 5fe2b85cbd |
| c7cccaaa69 |
| cbcfea466f |
| 4955672fc3 |
| 39f1c880b8 |
| 8c043da5b7 |
| 3cbe974eb4 |

@@ -153,9 +153,10 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
     std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result; 
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    assert (norm2(err)< 1.0e-5 );
     Dw.Report();
   }
 
@@ -192,7 +193,7 @@ int main (int argc, char ** argv)
 
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     sDw.Report();
   
     if(0){
@@ -208,8 +209,7 @@ int main (int argc, char ** argv)
 
     std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
 
-    RealF sum=0;
+    RealD sum=0;
     for(int x=0;x<latt4[0];x++){
     for(int y=0;y<latt4[1];y++){
     for(int z=0;z<latt4[2];z++){
@@ -227,12 +227,12 @@ int main (int argc, char ** argv)
       }
     }}}}}
     std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+    assert (sum< 1.0e-5 );
 
 
     if (1) {
 
       LatticeFermion sr_eo(sFGrid);
-      LatticeFermion serr(sFGrid);
 
       LatticeFermion ssrc_e (sFrbGrid);
       LatticeFermion ssrc_o (sFrbGrid);
@@ -244,8 +244,6 @@ int main (int argc, char ** argv)
 
       setCheckerboard(sr_eo,ssrc_o);
       setCheckerboard(sr_eo,ssrc_e);
-      serr = sr_eo-ssrc; 
-      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
 
       sr_e = zero;
       sr_o = zero;
@@ -263,7 +261,7 @@ int main (int argc, char ** argv)
       double flops=(1344.0*volume*ncall)/2;
 
       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
       sDw.Report();
 
       sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -273,9 +271,18 @@ int main (int argc, char ** argv)
       pickCheckerboard(Even,ssrc_e,sresult);
       pickCheckerboard(Odd ,ssrc_o,sresult);
       ssrc_e = ssrc_e - sr_e;
+      RealD error = norm2(ssrc_e);
+
       std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
       ssrc_o = ssrc_o - sr_o;
 
+      error+= norm2(ssrc_o);
       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
+      if(error>1.0e-5) { 
+	setCheckerboard(ssrc,ssrc_o);
+	setCheckerboard(ssrc,ssrc_e);
+	std::cout<< ssrc << std::endl;
+      }
     }
 
 
@@ -307,7 +314,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   err = ref-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e   (FrbGrid);
@@ -334,7 +341,7 @@ int main (int argc, char ** argv)
     double flops=(1344.0*volume*ncall)/2;
 
     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
     Dw.Report();
   }
   Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -350,11 +357,14 @@ int main (int argc, char ** argv)
 
   err = r_eo-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+  assert(norm2(src_e)<1.0e-5);
+  assert(norm2(src_o)<1.0e-5);
 
 
   }
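The benchmark hunks above do two things: the printed figures are relabelled from "per node" to "per rank", and the previously print-only norm checks become hard failures via `assert`. As a hedged illustration of that second pattern only (plain C++ with a stand-in `norm2()` over `std::vector`, not Grid code):

```cpp
// Sketch only: a stand-in norm2() replaces Grid's norm2(LatticeFermion). The point is
// that the benchmark now asserts on the residual instead of merely printing it.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

static double norm2(const std::vector<double> &v) {
  double s = 0.0;
  for (double x : v) s += x * x;
  return s;
}

int main() {
  std::vector<double> ref(16, 1.0), result(16, 1.0), err(16);
  for (std::size_t i = 0; i < err.size(); i++) err[i] = ref[i] - result[i];
  std::cout << "norm diff   " << norm2(err) << std::endl;
  assert(norm2(err) < 1.0e-5);   // new behaviour: fail loudly when the reference check breaks
  return 0;
}
```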

@@ -244,6 +244,9 @@ case ${ac_COMMS} in
     mpi)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
     ;;
+    mpi3)
+      AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+    ;;
     shmem)
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
 esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 
 ############### RNG selection

@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif
 
-#ifdef GRID_COMMS_SHMEM
-extern "C" { 
-#include <mpp/shmem.h>
-extern void * shmem_align(size_t, size_t);
-extern void  shmem_free(void *);
-}
-#endif
-
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
@@ -65,53 +57,21 @@ public:
   typedef _Tp        value_type;
 
   template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
-
   alignedAllocator() throw() { }
-
   alignedAllocator(const alignedAllocator&) throw() { }
-
   template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
-
   ~alignedAllocator() throw() { }
-
   pointer       address(reference __x)       const { return &__x; }
-  //  const_pointer address(const_reference __x) const { return &__x; }
-
   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 
   pointer allocate(size_type __n, const void* _p= 0)
   { 
-#ifdef GRID_COMMS_SHMEM
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-
-#define PARANOID_SYMMETRIC_HEAP
-#ifdef PARANOID_SYMMETRIC_HEAP
-    static void * bcast;
-    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
-
-    bcast = (void *) ptr;
-    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-
-    if ( bcast != ptr ) {
-      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
-      BACKTRACEFILE();
-      exit(0);
-    }
-
-    assert( bcast == (void *) ptr);
-
-#endif 
-#else
-
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
-
-#endif
     _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
@@ -123,27 +83,109 @@ public:
   }
 
   void deallocate(pointer __p, size_type) { 
-#ifdef GRID_COMMS_SHMEM
-    shmem_free((void *)__p);
-#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p); 
 #else
     free((void *)__p);
-#endif
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
 
   void destroy(pointer __p) { };
 };
+template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 
-template<typename _Tp>  inline bool
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-
-template<typename _Tp>  inline bool
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
-
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_COMMS_SHMEM
+extern "C" { 
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void  shmem_free(void *);
+}
+#define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template<typename _Tp>
+class commAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
+  commAllocator() throw() { }
+  commAllocator(const commAllocator&) throw() { }
+  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
+  ~commAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+#ifdef GRID_COMMS_SHMEM
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+#ifdef CRAY
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#else
+    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+#endif
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      //      BACKTRACEFILE();
+      exit(0);
+    }
+    assert( bcast == (void *) ptr);
+#endif 
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+    shmem_free((void *)__p);
+  }
+#else
+  pointer allocate(size_type __n, const void* _p= 0) 
+  {
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#else
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#endif
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+#ifdef HAVE_MM_MALLOC_H
+    _mm_free((void *)__p); 
+#else
+    free((void *)__p);
+#endif
+  }
+#endif
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector = std::vector<T,commAllocator<T> >;
+template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
     
 }; // namespace Grid
 #endif
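For orientation, the `Vector`, `commVector` and `Matrix` aliases added at the end of this hunk behave like `std::vector` with the corresponding allocator plugged in: aligned host memory for compute, and (when SHMEM comms are enabled) symmetric-heap memory for communication buffers. A minimal hedged sketch of their use; the umbrella header name is an assumption, and this is not code from the repository:

```cpp
// Sketch only: exercises the template typedefs declared above. Element values are left
// default-initialised because alignedAllocator::construct() is intentionally a no-op.
#include <Grid/Grid.h>   // assumed umbrella header providing the allocator typedefs

int main() {
  Grid::Vector<double>     aligned(1024);   // 128-byte aligned storage for kernels
  Grid::commVector<double> comms(1024);     // symmetric-heap storage when SHMEM comms are enabled
  Grid::Matrix<double>     m(4, Grid::Vector<double>(4));
  return (aligned.size() == 1024 && comms.size() == 1024 && m.size() == 4) ? 0 : 1;
}
```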

@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 

lib/FFT.h (27 lines changed)

@@ -200,18 +200,14 @@ namespace Grid {
 					       sign,FFTW_ESTIMATE);
 	}
 
-	double add,mul,fma;
-	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-	flops_call = add+mul+2.0*fma;
-
-	GridStopWatch timer;
-
+    std::vector<int> lcoor(Nd), gcoor(Nd);
 
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
 
 	  for(int idx=0;idx<sgrid->lSites();idx++) { 
 
-	    std::vector<int> lcoor(Nd);
+	    
     	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
 
 	    sobj s;
@@ -228,14 +224,11 @@ namespace Grid {
 	
 	// Loop over orthog coords
 	int NN=pencil_g.lSites();
-
-	GridStopWatch Timer;
-	Timer.Start();
+	GridStopWatch timer;
+	timer.Start();
 
 PARALLEL_FOR_LOOP
 	for(int idx=0;idx<NN;idx++) {
 
-	  std::vector<int> lcoor(Nd);
 	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
 
 	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
@@ -245,15 +238,17 @@ PARALLEL_FOR_LOOP
 	  }
 	}
 
-        Timer.Stop();
-	usec += Timer.useconds();
-	flops+= flops_call*NN;
-
+        timer.Stop();
+          double add,mul,fma;
+          FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+          flops_call = add+mul+2.0*fma;
+          usec += timer.useconds();
+          flops+= flops_call*NN;
         int pc = processor_coor[dim];
         for(int idx=0;idx<sgrid->lSites();idx++) {
-	  std::vector<int> lcoor(Nd);
 	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
-	  std::vector<int> gcoor = lcoor;
+	  gcoor = lcoor;
 	  // extract the result
 	  sobj s;
 	  gcoor[dim] = lcoor[dim]+L*pc;

lib/Init.cc (16 lines changed)

@@ -171,14 +171,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 //
 /////////////////////////////////////////////////////////
+static int Grid_is_initialised = 0;
+
+
 void Grid_init(int *argc,char ***argv)
 {
+  GridLogger::StopWatch.Start();
+
   CartesianCommunicator::Init(argc,argv);
 
   // Parse command line args.
 
-  GridLogger::StopWatch.Start();
-
   std::string arg;
   std::vector<std::string> logstreams;
   std::string defaultLog("Error,Warning,Message,Performance");
@@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
   }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
+    GridLogTimestamp(1);
+  }
 
   GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
@@ -274,12 +280,14 @@ void Grid_init(int *argc,char ***argv)
   std::cout << "GNU General Public License for more details."<<std::endl;
   std::cout << COL_BACKGROUND <<std::endl;
   std::cout << std::endl;
 
+  Grid_is_initialised = 1;
 }
 
   
 void Grid_finalize(void)
 {
-#ifdef GRID_COMMS_MPI
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
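The new `--timestamp` option simply calls the `GridLogTimestamp(int)` hook that the following hunks declare (Init header) and define (Log.cc). A hedged sketch of enabling it programmatically rather than on the command line; the umbrella header name is an assumption, and this is not code from the repository:

```cpp
// Sketch only: turn timestamped logging on from code instead of passing --timestamp.
#include <Grid/Grid.h>   // assumed umbrella header
#include <iostream>

int main(int argc, char **argv) {
  Grid::Grid_init(&argc, &argv);   // now also starts GridLogger::StopWatch first thing
  Grid::GridLogTimestamp(1);       // same effect as the new --timestamp command-line flag
  std::cout << Grid::GridLogMessage << "this line carries an elapsed-time stamp" << std::endl;
  Grid::Grid_finalize();
  return 0;
}
```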
@@ -33,6 +33,7 @@ namespace Grid {
 
   void Grid_init(int *argc,char ***argv);
   void Grid_finalize(void);
+
   // internal, controled with --handle
   void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
   void Grid_debug_handler_init(void);
@@ -44,6 +45,7 @@ namespace Grid {
   const std::vector<int> &GridDefaultMpi(void);
   const int              &GridThreads(void)  ;
   void                    GridSetThreads(int t) ;
+  void GridLogTimestamp(int);
 
   // Common parsing chores
   std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
@@ -34,8 +34,13 @@ directory
 namespace Grid {
 
 GridStopWatch Logger::StopWatch;
+int Logger::timestamp;
 std::ostream Logger::devnull(0);
 
+void GridLogTimestamp(int on){
+  Logger::Timestamp(on);
+}
+
 Colours GridLogColours(0);
 GridLogger GridLogError(1, "Error", GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
@@ -73,7 +78,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#ifdef GRID_COMMS_MPI
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM

lib/Log.h (23 lines changed)

@@ -39,8 +39,9 @@
 
 namespace Grid {
 
+//////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
-int Rank(void); // used for early stage debug before library init
+//////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 class Colours{
@@ -55,7 +56,6 @@ public:
 
   void Active(bool activate){
     is_active=activate;
-
     if (is_active){
      colour["BLACK"]  ="\033[30m";
      colour["RED"]    ="\033[31m";
@@ -77,10 +77,7 @@ public:
      colour["WHITE"] ="";
      colour["NORMAL"]="";
    }
-
-
  };
-
};
 
 
@@ -88,6 +85,7 @@ class Logger {
 protected:
   Colours &Painter;
   int active;
+  static int timestamp;
   std::string name, topName;
   std::string COLOUR;
 
@@ -99,8 +97,7 @@ public:
   std::string evidence() {return Painter.colour["YELLOW"];}
   std::string colour() {return Painter.colour[COLOUR];}
 
-  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
-  : active(on),
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
     name(nm),
     topName(topNm),
     Painter(col_class),
@@ -108,16 +105,20 @@ public:
   
   void Active(int on) {active = on;};
   int  isActive(void) {return active;};
+  static void Timestamp(int on) {timestamp = on;};
   
   friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 
     if ( log.active ) {
+      stream << log.background()<< log.topName << log.background()<< " : ";
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
+      if ( log.timestamp ) {
 	StopWatch.Stop();
 	GridTime now = StopWatch.Elapsed();
 	StopWatch.Start();
-      stream << log.background()<< log.topName << log.background()<< " : ";
-      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-      stream << log.evidence()<< now << log.background() << " : " << log.colour();
+	stream << log.evidence()<< now << log.background() << " : " ;
+      }
+      stream << log.colour();
       return stream;
     } else { 
       return devnull;
@@ -149,7 +150,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
 
 #define BACKTRACEFILE() {\
 char string[20];					\
-std::sprintf(string,"backtrace.%d",Rank());				\
+std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
 std::FILE * fp = std::fopen(string,"w");				\
 BACKTRACEFP(fp)\
 std::fclose(fp);	    \
@@ -1,14 +1,22 @@
 extra_sources=
 if BUILD_COMMS_MPI
   extra_sources+=communicator/Communicator_mpi.cc
+  extra_sources+=communicator/Communicator_base.cc
+endif
+
+if BUILD_COMMS_MPI3
+  extra_sources+=communicator/Communicator_mpi3.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 if BUILD_COMMS_NONE
   extra_sources+=communicator/Communicator_none.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 #

lib/Stencil.h (267 lines changed)

@@ -70,20 +70,20 @@
 
 namespace Grid {
 
-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
+inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+					       int off,std::vector<std::pair<int,int> > & table)
 {
   table.resize(0);
-  int rd = rhs._grid->_rdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
 
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+  if ( !grid->CheckerBoarded(dimension) ) {
     cbmask = 0x3;
   }
-  int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int e1=rhs._grid->_slice_nblock[dimension];
-  int e2=rhs._grid->_slice_block[dimension];
+  int so= plane*grid->_ostride[dimension]; // base offset for start of plane 
+  int e1=grid->_slice_nblock[dimension];
+  int e2=grid->_slice_block[dimension];
 
-  int stride=rhs._grid->_slice_stride[dimension];
+  int stride=grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) { 
     table.resize(e1*e2);
     for(int n=0;n<e1;n++){
@@ -99,7 +99,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
      for(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
 	 int o  = n*stride;
-	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
+	 int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
 	 if ( ocb &cbmask ) {
 	   table[bo]=std::pair<int,int>(bo,o+b); bo++;
 	 }
@@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
 }
 
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
-			   compressor &compress, int off,int so)
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
 PARALLEL_FOR_LOOP     
      for(int i=0;i<table.size();i++){
@@ -118,19 +117,6 @@ PARALLEL_FOR_LOOP
      }
 }
 
-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
-			     double &t_table ,double & t_data )
-{
-  std::vector<std::pair<int,int> > table;
-  Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  Gather_plane_simple_table         (table,rhs,buffer,compress,off,so);
-}
-
 
 struct StencilEntry { 
   uint64_t _offset;
   uint64_t _byte_offset;
@@ -143,6 +129,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:
 
+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
   typedef uint32_t StencilInteger;
   typedef typename cobj::vector_type vector_type;
   typedef typename cobj::scalar_type scalar_type;
@@ -158,7 +145,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     Integer to_rank;
     Integer from_rank;
     Integer bytes;
-	 volatile Integer done;
   };
 
   std::vector<Packet> Packets;
@@ -166,81 +152,53 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
   int face_table_computed;
   std::vector<std::vector<std::pair<int,int> > > face_table ;
   
-#define SEND_IMMEDIATE
-#define SERIAL_SENDS
 
   void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
- #ifdef SEND_IMMEDIATE
-	 commtime-=usecond();
-	 _grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
-	 commtime+=usecond();
- #endif
     Packet p;
     p.send_buf = xmit;
     p.recv_buf = rcv;
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
-	 p.done     = 0;
     comms_bytes+=2.0*bytes;
     Packets.push_back(p);
 
   }
 
- #ifdef SERIAL_SENDS
-       void Communicate(void ) { 
+  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
+    reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
- #ifndef SEND_IMMEDIATE
-	   _grid->SendToRecvFrom(
+	_grid->StencilSendToRecvFromBegin(reqs[i],
 					  Packets[i].send_buf,
 					  Packets[i].to_rank,
 					  Packets[i].recv_buf,
 					  Packets[i].from_rank,
 					  Packets[i].bytes);
- #endif
-	   Packets[i].done = 1;
+	/*
+      }else{
+	_grid->SendToRecvFromBegin(reqs[i],
+				   Packets[i].send_buf,
+				   Packets[i].to_rank,
+				   Packets[i].recv_buf,
+				   Packets[i].from_rank,
+				   Packets[i].bytes);
+      }
+	*/
     }
     commtime+=usecond();
   }
- #else
-       void Communicate(void ) { 
-	 typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
-	 std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
+  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
     commtime-=usecond();
-	 const int concurrency=2;
-	 for(int i=0;i<Packets.size();i+=concurrency){
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
- #ifndef SEND_IMMEDIATE
-	       _grid->SendToRecvFromBegin(reqs[j],
-					  Packets[j].send_buf,
-					  Packets[j].to_rank,
-					  Packets[j].recv_buf,
-					  Packets[j].from_rank,
-					  Packets[j].bytes);
- #endif
-	     }
-	   }
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
- #ifndef SEND_IMMEDIATE
-	       _grid->SendToRecvFromComplete(reqs[i]);
- #endif
-	     }
-	   }
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
-	       Packets[j].done = 1;
-	     }
-	   }
+    for(int i=0;i<Packets.size();i++){
+      //      if( ShmDirectCopy ) 
+	_grid->StencilSendToRecvFromComplete(reqs[i]);
+	//      else 
+	//	_grid->SendToRecvFromComplete(reqs[i]);
     }
     commtime+=usecond();
   }
- #endif
 
   ///////////////////////////////////////////
   // Simd merge queue for asynch comms
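In this hunk the spin-on-`done` immediate sends are replaced by an explicit begin/complete pair: `CommunicateBegin` posts every packet through `StencilSendToRecvFromBegin` and stores the requests, and `CommunicateComplete` later waits on them. For readers unfamiliar with the pattern, here is a minimal self-contained sketch of the same post-everything-then-wait structure written in plain MPI rather than Grid's communicator wrappers; it is an illustration only, not Grid code:

```cpp
// Plain-MPI sketch of the begin/complete structure adopted above; not Grid code.
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int to   = (rank + 1) % size;          // one direction of a neighbour exchange
  int from = (rank - 1 + size) % size;

  std::vector<double> sendbuf(1024, rank), recvbuf(1024, 0.0);
  MPI_Request reqs[2];

  // "CommunicateBegin": post all transfers up front and keep the requests.
  MPI_Irecv(recvbuf.data(), (int)recvbuf.size(), MPI_DOUBLE, from, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(sendbuf.data(), (int)sendbuf.size(), MPI_DOUBLE, to,   0, MPI_COMM_WORLD, &reqs[1]);

  // ... local gather/merge work could overlap here ...

  // "CommunicateComplete": wait on the whole request set.
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  MPI_Finalize();
  return 0;
}
```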
@@ -260,36 +218,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     m.rpointers= rpointers;
     m.buffer_size = buffer_size;
     m.packet_id   = packet_id;
- #ifdef SEND_IMMEDIATE
-	 mergetime-=usecond();
- PARALLEL_FOR_LOOP
-	 for(int o=0;o<m.buffer_size;o++){
-	   merge1(m.mpointer[o],m.rpointers,o);
-	 }
-	 mergetime+=usecond();
- #else
     Mergers.push_back(m);
- #endif
 
   }
 
   void CommsMerge(void ) { 
-	 //PARALLEL_NESTED_LOOP2 
     for(int i=0;i<Mergers.size();i++){	
       
-	 spintime-=usecond();
-	 int packet_id = Mergers[i].packet_id;
-	 while(! Packets[packet_id].done ); // spin for completion
-	 spintime+=usecond();
 
- #ifndef SEND_IMMEDIATE
       mergetime-=usecond();
 PARALLEL_FOR_LOOP
       for(int o=0;o<Mergers[i].buffer_size;o++){
 	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
       }
       mergetime+=usecond();
- #endif
 
     }
   }
@@ -312,24 +253,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
   // Flat vector, change layout for cache friendly.
   Vector<StencilEntry>  _entries;
   
-       inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
 
   void PrecomputeByteOffsets(void){
     for(int i=0;i<_entries.size();i++){
       if( _entries[i]._is_local ) {
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
       } else { 
-	     // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
       }
     }
   };
 
-       inline uint64_t Touch(int ent) {
-	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
-       }
+  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
   inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     local = _entries[ent]._is_local;
     perm  = _entries[ent]._permute;
     if (perm)  ptype = _permute_type[point]; 
@@ -340,20 +276,33 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     }
   }
   inline uint64_t GetPFInfo(int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     int local = _entries[ent]._is_local;
     if (local) return  base + _entries[ent]._byte_offset;
     else       return cbase + _entries[ent]._byte_offset;
   }
   
-       // Comms buffers
-       std::vector<Vector<scalar_object> > u_simd_send_buf;
-       std::vector<Vector<scalar_object> > u_simd_recv_buf;
-       Vector<cobj>          u_send_buf;
-       Vector<cobj>          comm_buf;
+  ///////////////////////////////////////////////////////////
+  // Unified Comms buffers for all directions
+  ///////////////////////////////////////////////////////////
+  // Vectors that live on the symmetric heap in case of SHMEM
+  //  std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
+  //  std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
+  //  commVector<cobj>          u_send_buf_hide;
+  //  commVector<cobj>          u_recv_buf_hide;
 
+  // These are used; either SHM objects or refs to the above symmetric heap vectors
+  // depending on comms target
+  cobj* u_recv_buf_p;
+  cobj* u_send_buf_p;
+  std::vector<scalar_object *> u_simd_send_buf;
+  std::vector<scalar_object *> u_simd_recv_buf;
+
   int u_comm_offset;
   int _unified_buffer_size;
   
+  cobj *CommBuf(void) { return u_recv_buf_p; }
+
   /////////////////////////////////////////
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
@@ -435,7 +384,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
       int i = ii; // reverse direction to get SIMD comms done first
       int point = i;
       
-
       int dimension    = directions[i];
       int displacement = distances[i];
       int shift = displacement;
@@ -482,18 +430,25 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
 	}
       }
     }
-       u_send_buf.resize(_unified_buffer_size);
-       comm_buf.resize(_unified_buffer_size);
 
-       PrecomputeByteOffsets();
-
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try to allocate for receiving in a shared memory region, fall back to buffer
+    /////////////////////////////////////////////////////////////////////////////////
     const int Nsimd = grid->Nsimd();
 
+    _grid->ShmBufferFreeAll();
 
     u_simd_send_buf.resize(Nsimd);
     u_simd_recv_buf.resize(Nsimd);
 
+    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
+    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
     for(int l=0;l<Nsimd;l++){
-	 u_simd_send_buf[l].resize(_unified_buffer_size);
-	 u_simd_recv_buf[l].resize(_unified_buffer_size);
+      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
     }
 
+    PrecomputeByteOffsets();
   }
 
   void Local     (int point, int dimension,int shiftpm,int cbmask)
@@ -717,38 +672,22 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     }
   }
   
-       template<class compressor>
-       void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
+  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
   {
+    std::vector<std::vector<CommsRequest_t> > reqs;
     calls++;
     Mergers.resize(0);
     Packets.resize(0);
+    _grid->StencilBarrier();
     HaloGather(source,compress);
-	 this->Communicate();
+    this->CommunicateBegin(reqs);
+    _grid->StencilBarrier();
+    this->CommunicateComplete(reqs);
+    _grid->StencilBarrier();
     CommsMerge(); // spins
   }
-#if 0
-       // Overlapping comms and compute typically slows down compute and  is useless
-       // unless memory bandwidth greatly exceeds network
-       template<class compressor>
-       std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
-	 Mergers.resize(0); 
-	 Packets.resize(0);
-	 HaloGather(source,compress);
-	 return std::thread([&] { this->Communicate(); });
-       }
-       void HaloExchangeComplete(std::thread &thr) 
-       {
-	 CommsMerge(); // spins
-	 jointime-=usecond();
-	 thr.join();
-	 jointime+=usecond();
-       }
-#endif
-       template<class compressor>
-       void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
+  
+  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
   {
     int dimension    = _directions[point];
     int displacement = _distances[point];
@@ -806,7 +745,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     assert(source._grid==_grid);
     halogtime-=usecond();
     
-	 assert (comm_buf.size() == _unified_buffer_size );
     u_comm_offset=0;
     
     // Gather all comms buffers
| @@ -863,37 +801,48 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	if ( !face_table_computed ) { | 	if ( !face_table_computed ) { | ||||||
| 	  t_table-=usecond(); | 	  t_table-=usecond(); | ||||||
| 	  face_table.resize(face_idx+1); | 	  face_table.resize(face_idx+1); | ||||||
| 		 Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]); | 	  Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, | ||||||
|  | 					     face_table[face_idx]); | ||||||
| 	  t_table+=usecond(); | 	  t_table+=usecond(); | ||||||
| 	} | 	} | ||||||
| 	       t_data-=usecond(); |  | ||||||
| 	       Gather_plane_simple_table         (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so); |  | ||||||
| 	       face_idx++; |  | ||||||
| 	       t_data+=usecond(); |  | ||||||
| 	       gathertime+=usecond(); |  | ||||||
| 	 | 	 | ||||||
| 	       //	       Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); |  | ||||||
| 	 | 	 | ||||||
| 	int rank           = _grid->_processor; | 	int rank           = _grid->_processor; | ||||||
| 	int recv_from_rank; | 	int recv_from_rank; | ||||||
| 	int xmit_to_rank; | 	int xmit_to_rank; | ||||||
| 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	 | ||||||
| 	assert (xmit_to_rank   != _grid->ThisRank()); | 	assert (xmit_to_rank   != _grid->ThisRank()); | ||||||
| 	assert (recv_from_rank != _grid->ThisRank()); | 	assert (recv_from_rank != _grid->ThisRank()); | ||||||
| 	 | 	 | ||||||
| 	       //      FIXME Implement asynchronous send & also avoid buffer copy | 	///////////////////////////////////////////////////////// | ||||||
| 	       AddPacket((void *)&u_send_buf[u_comm_offset], | 	// try the direct copy if possible | ||||||
| 			 (void *)  &comm_buf[u_comm_offset], | 	///////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |  | ||||||
|  | 	cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); | ||||||
|  | 	if ( send_buf==NULL ) {  | ||||||
|  | 	  send_buf = u_send_buf_p; | ||||||
|  | 	} | ||||||
|  | 	//	std::cout << " send_bufs  "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl; | ||||||
|  | 	t_data-=usecond(); | ||||||
|  | 	assert(u_send_buf_p!=NULL); | ||||||
|  | 	assert(send_buf!=NULL); | ||||||
|  | 	Gather_plane_simple_table         (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++; | ||||||
|  | 	t_data+=usecond(); | ||||||
|  | 	 | ||||||
|  | 	AddPacket((void *)&send_buf[u_comm_offset], | ||||||
|  | 		  (void *)&u_recv_buf_p[u_comm_offset], | ||||||
| 		  xmit_to_rank, | 		  xmit_to_rank, | ||||||
| 		  recv_from_rank, | 		  recv_from_rank, | ||||||
| 		  bytes); | 		  bytes); | ||||||
|  |  | ||||||
|  | 	gathertime+=usecond(); | ||||||
| 	u_comm_offset+=words; | 	u_comm_offset+=words; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
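The essential change in the gather above is the choice of destination: when ShmBufferTranslate can resolve the equivalent of our receive buffer inside the destination rank's shared segment (i.e. the destination shares the node), the face is gathered straight into that rank's receive buffer; otherwise it is gathered into the local send buffer and the packet added below carries it over MPI. A small sketch of that decision (the template and parameter names are illustrative, not the Grid API):

    // shm_translate is any callable mimicking _grid->ShmBufferTranslate(rank, ptr):
    // it returns the peer's view of ptr for an on-node rank, or nullptr otherwise.
    template<class cobj, class Translate>
    cobj *choose_gather_target(Translate &&shm_translate,
                               int   xmit_to_rank,
                               cobj *u_recv_buf_p,    // our receive buffer
                               cobj *u_send_buf_p)    // fallback local send buffer
    {
      cobj *target = (cobj *)shm_translate(xmit_to_rank, (void *)u_recv_buf_p);
      if (target == nullptr) target = u_send_buf_p;   // off-node neighbour: MPI copies it later
      return target;
    }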
|    |    | ||||||
|  |  | ||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) |   void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) | ||||||
|   { |   { | ||||||
| @@ -974,10 +923,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset]; | 	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset]; | ||||||
| 	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; | 	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; | ||||||
| 	   | 	   | ||||||
| 		 void *vrp = (void *)rp; |  | ||||||
| 		 void *vsp = (void *)sp; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| 	  if(nbr_proc){ | 	  if(nbr_proc){ | ||||||
| 	     | 	     | ||||||
| 	    int recv_from_rank; | 	    int recv_from_rank; | ||||||
| @@ -985,9 +930,17 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	     | 	     | ||||||
| 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|   |   | ||||||
| 		   AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); | 	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); | ||||||
|  | 	    //	    if ((ShmDirectCopy==0)||(shm==NULL)) {  | ||||||
|  | 	    if (shm==NULL) {  | ||||||
|  | 	      shm = rp; | ||||||
|  | 	    }  | ||||||
| 	     | 	     | ||||||
| 		   rpointers[i] = rp; | 	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node | ||||||
|  | 	    // assuming above pointer flip | ||||||
|  | 	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); | ||||||
|  | 	     | ||||||
|  | 	    rpointers[i] = shm; | ||||||
| 	     | 	     | ||||||
| 	  } else {  | 	  } else {  | ||||||
| 	     | 	     | ||||||
| @@ -996,7 +949,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	       AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); | 	AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); | ||||||
| 	 | 	 | ||||||
| 	u_comm_offset     +=buffer_size; | 	u_comm_offset     +=buffer_size; | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -127,6 +127,22 @@ class GridThread { | |||||||
|     ThreadBarrier(); |     ThreadBarrier(); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   static void bcopy(const void *src, void *dst, size_t len) { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel  | ||||||
|  |     { | ||||||
|  |       const char *c_src =(char *) src; | ||||||
|  |       char *c_dest=(char *) dst; | ||||||
|  |       int me,mywork,myoff; | ||||||
|  |       GridThread::GetWorkBarrier(len,me, mywork,myoff); | ||||||
|  |       ::bcopy(&c_src[myoff],&c_dest[myoff],mywork); // libc bcopy; an unqualified call would recurse into this helper | ||||||
|  |     } | ||||||
|  | #else  | ||||||
|  |     ::bcopy(src,dst,len); | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } | } | ||||||
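GridThread::bcopy above splits the byte range over the OpenMP team via GetWorkBarrier before copying each slice. A standalone sketch of an even work split of that kind (a hypothetical helper, not the exact Grid routine):

    #include <algorithm>
    #include <cstddef>

    // Divide nwork items over nthreads; thread 'me' gets [myoff, myoff + mywork).
    void split_work(size_t nwork, int nthreads, int me, size_t &mywork, size_t &myoff) {
      size_t chunk = (nwork + nthreads - 1) / nthreads;     // ceil(nwork / nthreads)
      myoff  = std::min(nwork, chunk * (size_t)me);
      mywork = std::min(nwork - myoff, chunk);
    }
    // e.g. split_work(10, 4, 3, mywork, myoff) yields myoff = 9, mywork = 1.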
|   | |||||||
| @@ -77,7 +77,7 @@ public: | |||||||
|     // GridCartesian / GridRedBlackCartesian |     // GridCartesian / GridRedBlackCartesian | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     virtual int CheckerBoarded(int dim)=0; |     virtual int CheckerBoarded(int dim)=0; | ||||||
|     virtual int CheckerBoard(std::vector<int> site)=0; |     virtual int CheckerBoard(std::vector<int> &site)=0; | ||||||
|     virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; |     virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||||
|     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; |     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||||
|     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; |     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||||
|   | |||||||
| @@ -49,7 +49,7 @@ public: | |||||||
|     virtual int CheckerBoarded(int dim){ |     virtual int CheckerBoarded(int dim){ | ||||||
|       return 0; |       return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoard(std::vector<int> site){ |     virtual int CheckerBoard(std::vector<int> &site){ | ||||||
|         return 0; |         return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoardDestination(int cb,int shift,int dim){ |     virtual int CheckerBoardDestination(int cb,int shift,int dim){ | ||||||
|   | |||||||
| @@ -49,7 +49,7 @@ public: | |||||||
|       if( dim==_checker_dim) return 1; |       if( dim==_checker_dim) return 1; | ||||||
|       else return 0; |       else return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoard(std::vector<int> site){ |     virtual int CheckerBoard(std::vector<int> &site){ | ||||||
|       int linear=0; |       int linear=0; | ||||||
|       assert(site.size()==_ndimension); |       assert(site.size()==_ndimension); | ||||||
|       for(int d=0;d<_ndimension;d++){  |       for(int d=0;d<_ndimension;d++){  | ||||||
|   | |||||||
							
								
								
									
132  lib/communicator/Communicator_base.cc  Normal file
							| @@ -0,0 +1,132 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_base.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include "Grid.h" | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::ShmRank; | ||||||
|  | int CartesianCommunicator::ShmSize; | ||||||
|  | int CartesianCommunicator::GroupRank; | ||||||
|  | int CartesianCommunicator::GroupSize; | ||||||
|  | int CartesianCommunicator::WorldRank; | ||||||
|  | int CartesianCommunicator::WorldSize; | ||||||
|  | int CartesianCommunicator::Slave; | ||||||
|  | void *              CartesianCommunicator::ShmCommBuf; | ||||||
|  |  | ||||||
|  | ///////////////////////////////// | ||||||
|  | // Alloc, free shmem region | ||||||
|  | ///////////////////////////////// | ||||||
|  | void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){ | ||||||
|  |   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes | ||||||
|  |   void *ptr = (void *)heap_top; | ||||||
|  |   heap_top  += bytes; | ||||||
|  |   heap_bytes+= bytes; | ||||||
|  |   std::cout <<"Shm alloc "<<ptr<<std::endl; | ||||||
|  |   assert(heap_bytes < MAX_MPI_SHM_BYTES); | ||||||
|  |   return ptr; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShmBufferFreeAll(void) {  | ||||||
|  |   heap_top  =(size_t)ShmBufferSelf(); | ||||||
|  |   heap_bytes=0; | ||||||
|  | } | ||||||
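The commented-out line in ShmBufferMalloc hints at rounding each request up to the vector width; note that as written it adds sizeof(vRealD) rather than sizeof(vRealD)-1 before masking, so it would bump even already-aligned sizes. The conventional power-of-two round-up looks like this (a sketch, not part of the commit):

    #include <cstddef>

    // Round bytes up to the next multiple of align (align must be a power of two).
    static inline size_t align_up(size_t bytes, size_t align) {
      return (bytes + align - 1) & ~(align - 1);
    }
    // e.g. align_up(100, 32) == 128 and align_up(128, 32) == 128.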
|  |  | ||||||
|  | ///////////////////////////////// | ||||||
|  | // Grid information queries | ||||||
|  | ///////////////////////////////// | ||||||
|  | int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; }; | ||||||
|  | int                      CartesianCommunicator::BossRank(void)          { return 0; }; | ||||||
|  | int                      CartesianCommunicator::ThisRank(void)          { return _processor; }; | ||||||
|  | const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; | ||||||
|  | const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)     { return _processors; }; | ||||||
|  | int                      CartesianCommunicator::ProcessorCount(void)    { return _Nprocessors; }; | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
|  | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  | int  CartesianCommunicator::RankWorld(void){ return WorldRank; }; | ||||||
|  | int CartesianCommunicator::Ranks    (void) { return WorldSize; }; | ||||||
|  | int CartesianCommunicator::Nodes    (void) { return GroupSize; }; | ||||||
|  | int CartesianCommunicator::Cores    (void) { return ShmSize;   }; | ||||||
|  | int CartesianCommunicator::NodeRank (void) { return GroupRank; }; | ||||||
|  | int CartesianCommunicator::CoreRank (void) { return ShmRank;   }; | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexF &c) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((float *)&c,2); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((float *)c,2*N); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexD &c) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((double *)&c,2); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((double *)c,2*N); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #ifndef GRID_COMMS_MPI3 | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						       void *xmit, | ||||||
|  | 						       int xmit_to_rank, | ||||||
|  | 						       void *recv, | ||||||
|  | 						       int recv_from_rank, | ||||||
|  | 						       int bytes) | ||||||
|  | { | ||||||
|  |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | ||||||
|  | { | ||||||
|  |   SendToRecvFromComplete(waitall); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilBarrier(void){}; | ||||||
|  |  | ||||||
|  | commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBuffer(int rank) { | ||||||
|  |   return NULL; | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {  | ||||||
|  |   return NULL; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShmInitGeneric(void){ | ||||||
|  |   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); | ||||||
|  |   ShmCommBuf=(void *)&ShmBufStorageVector[0]; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -34,77 +34,137 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef GRID_COMMS_MPI | #ifdef GRID_COMMS_MPI | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
|  | #ifdef GRID_COMMS_MPI3 | ||||||
|  | #include <mpi.h> | ||||||
|  | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
| #include <mpp/shmem.h> | #include <mpp/shmem.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| class CartesianCommunicator { | class CartesianCommunicator { | ||||||
|   public:     |   public:     | ||||||
|  |  | ||||||
|   // Communicator should know nothing of the physics grid, only processor grid. |   // 65536 ranks per node adequate for now | ||||||
|  |   // 128MB shared memory for comms enough for 48^4 local vol comms | ||||||
|  |   // Give external control (command line override?) of this | ||||||
|  |  | ||||||
|  |   static const int      MAXLOG2RANKSPERNODE = 16;             | ||||||
|  |   static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||||
|  |  | ||||||
|  |   // Communicator should know nothing of the physics grid, only processor grid. | ||||||
|   int              _Nprocessors;     // How many in all |   int              _Nprocessors;     // How many in all | ||||||
|   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes. |   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes. | ||||||
|   int              _processor;       // linear processor rank |   int              _processor;       // linear processor rank | ||||||
|   std::vector<int> _processor_coor;  // linear processor coordinate |   std::vector<int> _processor_coor;  // linear processor coordinate | ||||||
|   unsigned long _ndimension; |   unsigned long _ndimension; | ||||||
|  |  | ||||||
| #ifdef GRID_COMMS_MPI | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) | ||||||
|   MPI_Comm communicator; |   MPI_Comm communicator; | ||||||
|  |   static MPI_Comm communicator_world; | ||||||
|   typedef MPI_Request CommsRequest_t; |   typedef MPI_Request CommsRequest_t; | ||||||
| #else  | #else  | ||||||
|   typedef int CommsRequest_t; |   typedef int CommsRequest_t; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  |   // Helper functionality for SHM Windows common to all other impls | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  |   // Longer term; drop this in favour of a master / slave model with  | ||||||
|  |   // cartesian communicator on a subset of ranks, slave ranks controlled | ||||||
|  |   // by group leader with data xfer via shared memory | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef  GRID_COMMS_MPI3 | ||||||
|  |   std::vector<int>  WorldDims; | ||||||
|  |   std::vector<int>  GroupDims; | ||||||
|  |   std::vector<int>  ShmDims; | ||||||
|  |    | ||||||
|  |   std::vector<int> GroupCoor; | ||||||
|  |   std::vector<int> ShmCoor; | ||||||
|  |   std::vector<int> WorldCoor; | ||||||
|  |    | ||||||
|  |   static std::vector<int> GroupRanks;  | ||||||
|  |   static std::vector<int> MyGroup; | ||||||
|  |   static int ShmSetup; | ||||||
|  |   static MPI_Win ShmWindow;  | ||||||
|  |   static MPI_Comm ShmComm; | ||||||
|  |    | ||||||
|  |   std::vector<int>  LexicographicToWorldRank; | ||||||
|  |    | ||||||
|  |   static std::vector<void *> ShmCommBufs; | ||||||
|  | #else  | ||||||
|  |   static void ShmInitGeneric(void); | ||||||
|  |   static commVector<uint8_t> ShmBufStorageVector; | ||||||
|  | #endif  | ||||||
|  |   static void * ShmCommBuf; | ||||||
|  |   size_t heap_top; | ||||||
|  |   size_t heap_bytes; | ||||||
|  |   void *ShmBufferSelf(void); | ||||||
|  |   void *ShmBuffer(int rank); | ||||||
|  |   void *ShmBufferTranslate(int rank,void * local_p); | ||||||
|  |   void *ShmBufferMalloc(size_t bytes); | ||||||
|  |   void ShmBufferFreeAll(void) ; | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|  |   // Must call in Grid startup | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|   static void Init(int *argc, char ***argv); |   static void Init(int *argc, char ***argv); | ||||||
|    |    | ||||||
|     // Constructor |   //////////////////////////////////////////////// | ||||||
|  |   // Constructor of any given grid | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|   CartesianCommunicator(const std::vector<int> &pdimensions_in); |   CartesianCommunicator(const std::vector<int> &pdimensions_in); | ||||||
|    |    | ||||||
|     // Wraps MPI_Cart routines |   //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Wraps MPI_Cart routines, or implements equivalent on other impls | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   void ShiftedRanks(int dim,int shift,int & source, int & dest); |   void ShiftedRanks(int dim,int shift,int & source, int & dest); | ||||||
|   int  RankFromProcessorCoor(std::vector<int> &coor); |   int  RankFromProcessorCoor(std::vector<int> &coor); | ||||||
|   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); |   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); | ||||||
|    |    | ||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|     // Grid information queries |   // Grid information and queries | ||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|     int                      IsBoss(void)            { return _processor==0; }; |   static int ShmRank; | ||||||
|     int                      BossRank(void)          { return 0; }; |   static int ShmSize; | ||||||
|     int                      ThisRank(void)          { return _processor; }; |   static int GroupSize; | ||||||
|     const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; }; |   static int GroupRank; | ||||||
|     const std::vector<int> & ProcessorGrid(void)     { return _processors; }; |   static int WorldRank; | ||||||
|     int                      ProcessorCount(void)    { return _Nprocessors; }; |   static int WorldSize; | ||||||
|  |   static int Slave; | ||||||
|  |    | ||||||
|  |   int                      IsBoss(void)            ; | ||||||
|  |   int                      BossRank(void)          ; | ||||||
|  |   int                      ThisRank(void)          ; | ||||||
|  |   const std::vector<int> & ThisProcessorCoor(void) ; | ||||||
|  |   const std::vector<int> & ProcessorGrid(void)     ; | ||||||
|  |   int                      ProcessorCount(void)    ; | ||||||
|  |   static int Ranks    (void); | ||||||
|  |   static int Nodes    (void); | ||||||
|  |   static int Cores    (void); | ||||||
|  |   static int NodeRank (void); | ||||||
|  |   static int CoreRank (void); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   static int  RankWorld(void) ; | ||||||
|  |   static void BroadcastWorld(int root,void* data, int bytes); | ||||||
|    |    | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Reduction |   // Reduction | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   void GlobalSum(RealF &); |   void GlobalSum(RealF &); | ||||||
|   void GlobalSumVector(RealF *,int N); |   void GlobalSumVector(RealF *,int N); | ||||||
|  |  | ||||||
|   void GlobalSum(RealD &); |   void GlobalSum(RealD &); | ||||||
|   void GlobalSumVector(RealD *,int N); |   void GlobalSumVector(RealD *,int N); | ||||||
|  |  | ||||||
|   void GlobalSum(uint32_t &); |   void GlobalSum(uint32_t &); | ||||||
|   void GlobalSum(uint64_t &); |   void GlobalSum(uint64_t &); | ||||||
|  |   void GlobalSum(ComplexF &c); | ||||||
|     void GlobalSum(ComplexF &c) |   void GlobalSumVector(ComplexF *c,int N); | ||||||
|     { |   void GlobalSum(ComplexD &c); | ||||||
|       GlobalSumVector((float *)&c,2); |   void GlobalSumVector(ComplexD *c,int N); | ||||||
|     } |  | ||||||
|     void GlobalSumVector(ComplexF *c,int N) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((float *)c,2*N); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     void GlobalSum(ComplexD &c) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((double *)&c,2); |  | ||||||
|     } |  | ||||||
|     void GlobalSumVector(ComplexD *c,int N) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((double *)c,2*N); |  | ||||||
|     } |  | ||||||
|    |    | ||||||
|   template<class obj> void GlobalSum(obj &o){ |   template<class obj> void GlobalSum(obj &o){ | ||||||
|     typedef typename obj::scalar_type scalar_type; |     typedef typename obj::scalar_type scalar_type; | ||||||
| @@ -112,6 +172,7 @@ class CartesianCommunicator { | |||||||
|     scalar_type * ptr = (scalar_type *)& o; |     scalar_type * ptr = (scalar_type *)& o; | ||||||
|     GlobalSumVector(ptr,words); |     GlobalSumVector(ptr,words); | ||||||
|   } |   } | ||||||
|  |    | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Face exchange, buffer swap in translational invariant way |   // Face exchange, buffer swap in translational invariant way | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -133,8 +194,19 @@ class CartesianCommunicator { | |||||||
| 			   void *recv, | 			   void *recv, | ||||||
| 			   int recv_from_rank, | 			   int recv_from_rank, | ||||||
| 			   int bytes); | 			   int bytes); | ||||||
|  |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|  |   void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 				  void *xmit, | ||||||
|  | 				  int xmit_to_rank, | ||||||
|  | 				  void *recv, | ||||||
|  | 				  int recv_from_rank, | ||||||
|  | 				  int bytes); | ||||||
|  |    | ||||||
|  |   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |   void StencilBarrier(void); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Barrier |   // Barrier | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -144,13 +216,12 @@ class CartesianCommunicator { | |||||||
|   // Broadcast a buffer and composite larger |   // Broadcast a buffer and composite larger | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   void Broadcast(int root,void* data, int bytes); |   void Broadcast(int root,void* data, int bytes); | ||||||
|  |    | ||||||
|   template<class obj> void Broadcast(int root,obj &data) |   template<class obj> void Broadcast(int root,obj &data) | ||||||
|     { |     { | ||||||
|       Broadcast(root,(void *)&data,sizeof(data)); |       Broadcast(root,(void *)&data,sizeof(data)); | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     static void BroadcastWorld(int root,void* data, int bytes); |  | ||||||
|  |  | ||||||
| };  | };  | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -30,6 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
| // Should error check all MPI calls. | // Should error check all MPI calls. | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|   int flag; |   int flag; | ||||||
| @@ -37,12 +43,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|   if ( !flag ) { |   if ( !flag ) { | ||||||
|     MPI_Init(argc,argv); |     MPI_Init(argc,argv); | ||||||
|   } |   } | ||||||
| } |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   MPI_Comm_rank(communicator_world,&WorldRank); | ||||||
|   int Rank(void) { |   MPI_Comm_size(communicator_world,&WorldSize); | ||||||
|     int pe; |   ShmRank=0; | ||||||
|     MPI_Comm_rank(MPI_COMM_WORLD,&pe); |   ShmSize=1; | ||||||
|     return pe; |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| @@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|   _processors = processors; |   _processors = processors; | ||||||
|   _processor_coor.resize(_ndimension); |   _processor_coor.resize(_ndimension); | ||||||
|    |    | ||||||
|   MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); |   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); | ||||||
|   MPI_Comm_rank(communicator,&_processor); |   MPI_Comm_rank(communicator,&_processor); | ||||||
|   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); |   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||||
|  |  | ||||||
| @@ -67,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|    |    | ||||||
|   assert(Size==_Nprocessors); |   assert(Size==_Nprocessors); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::GlobalSum(uint32_t &u){ | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| @@ -168,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> & | |||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |  | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -187,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | |||||||
| 		     communicator); | 		     communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // Should only be used before Grid Init has finished. | ||||||
|  |   // Check for this? | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
| { | { | ||||||
|   int ierr= MPI_Bcast(data, |   int ierr= MPI_Bcast(data, | ||||||
| 		      bytes, | 		      bytes, | ||||||
| 		      MPI_BYTE, | 		      MPI_BYTE, | ||||||
| 		      root, | 		      root, | ||||||
| 		      MPI_COMM_WORLD); | 		      communicator_world); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
574  lib/communicator/Communicator_mpi3.cc  Normal file
							| @@ -0,0 +1,574 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_mpi3.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include "Grid.h" | ||||||
|  | #include <mpi.h> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::ShmSetup = 0; | ||||||
|  |  | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  | MPI_Comm CartesianCommunicator::ShmComm; | ||||||
|  | MPI_Win  CartesianCommunicator::ShmWindow; | ||||||
|  |  | ||||||
|  | std::vector<int> CartesianCommunicator::GroupRanks;   | ||||||
|  | std::vector<int> CartesianCommunicator::MyGroup; | ||||||
|  | std::vector<void *> CartesianCommunicator::ShmCommBufs; | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBufferSelf(void) | ||||||
|  | { | ||||||
|  |   return ShmCommBufs[ShmRank]; | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBuffer(int rank) | ||||||
|  | { | ||||||
|  |   int gpeer = GroupRanks[rank]; | ||||||
|  |   if (gpeer == MPI_UNDEFINED){ | ||||||
|  |     return NULL; | ||||||
|  |   } else {  | ||||||
|  |     return ShmCommBufs[gpeer]; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | ||||||
|  | { | ||||||
|  |   int gpeer = GroupRanks[rank]; | ||||||
|  |   if (gpeer == MPI_UNDEFINED){ | ||||||
|  |     return NULL; | ||||||
|  |   } else {  | ||||||
|  |     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; | ||||||
|  |     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; | ||||||
|  |     return (void *) remote; | ||||||
|  |   } | ||||||
|  | } | ||||||
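ShmBufferTranslate relies on every rank's segment being mapped locally at a (possibly different) base address: a local pointer's offset inside this rank's segment is simply reapplied to the peer's base. A tiny worked example with made-up addresses:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t my_base   = 0x7f0000000000ull;  // ShmCommBufs[ShmRank]  (hypothetical)
      uint64_t peer_base = 0x7f0008000000ull;  // ShmCommBufs[gpeer]    (hypothetical)
      uint64_t local_p   = my_base + 0x1234;   // somewhere inside my segment

      uint64_t offset = local_p - my_base;
      uint64_t remote = peer_base + offset;    // same object, seen through the peer's mapping
      assert(remote == peer_base + 0x1234);
      return 0;
    }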
|  |  | ||||||
|  | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |   int flag; | ||||||
|  |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|  |   if ( !flag ) { | ||||||
|  |     MPI_Init(argc,argv); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   MPI_Comm_rank(communicator_world,&WorldRank); | ||||||
|  |   MPI_Comm_size(communicator_world,&WorldSize); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   // Split into groups that can share memory | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); | ||||||
|  |   MPI_Comm_rank(ShmComm     ,&ShmRank); | ||||||
|  |   MPI_Comm_size(ShmComm     ,&ShmSize); | ||||||
|  |   GroupSize = WorldSize/ShmSize; | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   // find world ranks in our SHM group (i.e. which ranks are on our node) | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Group WorldGroup, ShmGroup; | ||||||
|  |   MPI_Comm_group (communicator_world, &WorldGroup);  | ||||||
|  |   MPI_Comm_group (ShmComm, &ShmGroup); | ||||||
|  |    | ||||||
|  |   std::vector<int> world_ranks(WorldSize);  | ||||||
|  |   GroupRanks.resize(WorldSize);  | ||||||
|  |   MyGroup.resize(ShmSize); | ||||||
|  |   for(int r=0;r<WorldSize;r++) world_ranks[r]=r; | ||||||
|  |    | ||||||
|  |   MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);  | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify who is in my group and nominate the leader | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int g=0; | ||||||
|  |   for(int rank=0;rank<WorldSize;rank++){ | ||||||
|  |     if(GroupRanks[rank]!=MPI_UNDEFINED){ | ||||||
|  |       assert(g<ShmSize); | ||||||
|  |       MyGroup[g++] = rank; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>()); | ||||||
|  |   int myleader = MyGroup[0]; | ||||||
|  |    | ||||||
|  |   std::vector<int> leaders_1hot(WorldSize,0); | ||||||
|  |   std::vector<int> leaders_group(GroupSize,0); | ||||||
|  |   leaders_1hot [ myleader ] = 1; | ||||||
|  |      | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // global sum leaders over comm world | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  |    | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // find the group leaders world rank | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int group=0; | ||||||
|  |   for(int l=0;l<WorldSize;l++){ | ||||||
|  |     if(leaders_1hot[l]){ | ||||||
|  |       leaders_group[group++] = l; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify the rank of the group in which I (and my leader) live | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   GroupRank=-1; | ||||||
|  |   for(int g=0;g<GroupSize;g++){ | ||||||
|  |     if (myleader == leaders_group[g]){ | ||||||
|  |       GroupRank=g; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   assert(GroupRank!=-1); | ||||||
|  |    | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // allocate the shared window for our group | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    | ||||||
|  |   ShmCommBuf = 0; | ||||||
|  |   ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   // KNL hack -- force to numa-domain 1 in flat | ||||||
|  | #if 0 | ||||||
|  |   //#include <numaif.h> | ||||||
|  |   for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){ | ||||||
|  |     void *pages = (void *) ( page + ShmCommBuf ); | ||||||
|  |     int status; | ||||||
|  |     int flags=MPOL_MF_MOVE_ALL; | ||||||
|  |     int nodes=1; // numa domain == MCDRAM | ||||||
|  |     unsigned long count=1; | ||||||
|  |     ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||||
|  |     if (ierr && (page==0)) perror("numa relocate command failed"); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); | ||||||
|  |    | ||||||
|  |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Plan: allocate a fixed SHM region, used purely as scratch during stencil comms, with no per-call allocate/free. | ||||||
|  |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   ShmCommBufs.resize(ShmSize); | ||||||
|  |   for(int r=0;r<ShmSize;r++){ | ||||||
|  |     MPI_Aint sz; | ||||||
|  |     int dsp_unit; | ||||||
|  |     MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Verbose for now | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   if (WorldRank == 0){ | ||||||
|  |     std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected "; | ||||||
|  |     std::cout<< WorldSize << " Ranks " ; | ||||||
|  |     std::cout<< GroupSize << " Nodes " ; | ||||||
|  |     std::cout<<  ShmSize  << " ranks per node"<<std::endl; | ||||||
|  |      | ||||||
|  |     std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size "; | ||||||
|  |     std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl; | ||||||
|  |  | ||||||
|  |     for(int g=0;g<GroupSize;g++){ | ||||||
|  |       std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {"; | ||||||
|  |     for(int g=0;g<ShmSize;g++){ | ||||||
|  |       std::cout<<std::hex<<ShmCommBufs[g]<<std::dec; | ||||||
|  |       if(g!=ShmSize-1) std::cout<<","; | ||||||
|  |       else std::cout<<"}"<<std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   for(int g=0;g<GroupSize;g++){ | ||||||
|  |     if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {"; | ||||||
|  |     for(int r=0;r<ShmSize;r++){ | ||||||
|  |       if ( (ShmRank == 0) && (GroupRank==g) ) { | ||||||
|  | 	std::cout<<MyGroup[r]; | ||||||
|  | 	if(r<ShmSize-1) std::cout<<","; | ||||||
|  | 	else std::cout<<"}"<<std::endl; | ||||||
|  |       } | ||||||
|  |       MPI_Barrier(communicator_world); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   assert(ShmSetup==0);  ShmSetup=1; | ||||||
|  | } | ||||||
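Init above does the full job of grouping ranks, electing node leaders and reporting the layout. Stripped to the MPI-3 shared-memory essentials, the window setup it performs amounts to the following standalone program (error handling omitted; the 1 MB size is arbitrary where Grid uses MAX_MPI_SHM_BYTES):

    #include <mpi.h>
    #include <vector>
    #include <cstdio>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);

      // On-node communicator: every rank in it can map the others' memory.
      MPI_Comm shm_comm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);

      int shm_rank, shm_size;
      MPI_Comm_rank(shm_comm, &shm_rank);
      MPI_Comm_size(shm_comm, &shm_size);

      // One shared window; each rank contributes a segment of 'bytes'.
      const MPI_Aint bytes = 1 << 20;
      void *my_base = nullptr;
      MPI_Win win;
      MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, shm_comm, &my_base, &win);
      MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

      // Query every on-node rank's segment base, as ShmCommBufs does.
      std::vector<void *> bases(shm_size);
      for (int r = 0; r < shm_size; r++) {
        MPI_Aint sz; int disp_unit;
        MPI_Win_shared_query(win, r, &sz, &disp_unit, &bases[r]);
      }
      std::printf("rank %d of %d on node, my segment at %p\n", shm_rank, shm_size, my_base);

      MPI_Win_unlock_all(win);
      MPI_Win_free(&win);
      MPI_Finalize();
      return 0;
    }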
|  |  | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Want to implement some magic ... Group sub-cubes into those on same node | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
|  | { | ||||||
|  |   std::vector<int> coor = _processor_coor; | ||||||
|  |  | ||||||
|  |   assert(std::abs(shift) <_processors[dim]); | ||||||
|  |  | ||||||
|  |   coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim]; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,source,_processors); | ||||||
|  |   source = LexicographicToWorldRank[source]; | ||||||
|  |  | ||||||
|  |   coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim]; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,dest,_processors); | ||||||
|  |   dest = LexicographicToWorldRank[dest]; | ||||||
|  | } | ||||||
|  | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   int rank; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,rank,_processors); | ||||||
|  |   rank = LexicographicToWorldRank[rank]; | ||||||
|  |   return rank; | ||||||
|  | } | ||||||
|  | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   Lexicographic::CoorFromIndex(coor,rank,_processors); | ||||||
|  |   rank = LexicographicToWorldRank[rank]; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
|  | {  | ||||||
|  |   int ierr; | ||||||
|  |  | ||||||
|  |   communicator=communicator_world; | ||||||
|  |  | ||||||
|  |   _ndimension = processors.size(); | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Assert power of two shm_size. | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int log2size = -1; | ||||||
|  |   for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){   | ||||||
|  |     if ( (0x1<<i) == ShmSize ) { | ||||||
|  |       log2size = i; | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   assert(log2size != -1); | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify subblock of ranks on node spreading across dims | ||||||
|  |   // in a maximally symmetrical way | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int dim = 0; | ||||||
|  |    | ||||||
|  |   std::vector<int> WorldDims = processors; | ||||||
|  |  | ||||||
|  |   ShmDims.resize(_ndimension,1); | ||||||
|  |   GroupDims.resize(_ndimension); | ||||||
|  |      | ||||||
|  |   ShmCoor.resize(_ndimension); | ||||||
|  |   GroupCoor.resize(_ndimension); | ||||||
|  |   WorldCoor.resize(_ndimension); | ||||||
|  |  | ||||||
|  |   for(int l2=0;l2<log2size;l2++){ | ||||||
|  |     while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension; | ||||||
|  |     ShmDims[dim]*=2; | ||||||
|  |     dim=(dim+1)%_ndimension; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish torus of processes and nodes with sub-blockings | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   for(int d=0;d<_ndimension;d++){ | ||||||
|  |     GroupDims[d] = WorldDims[d]/ShmDims[d]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Check processor counts match | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   _Nprocessors=1; | ||||||
|  |   _processors = processors; | ||||||
|  |   _processor_coor.resize(_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension;i++){ | ||||||
|  |     _Nprocessors*=_processors[i]; | ||||||
|  |   } | ||||||
|  |   assert(WorldSize==_Nprocessors); | ||||||
|  |        | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish mapping between lexico physics coord and WorldRank | ||||||
|  |   //  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   LexicographicToWorldRank.resize(WorldSize,0); | ||||||
|  |   Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims); | ||||||
|  |   Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims); | ||||||
|  |   for(int d=0;d<_ndimension;d++){ | ||||||
|  |     WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d]; | ||||||
|  |   } | ||||||
|  |   _processor_coor = WorldCoor; | ||||||
|  |  | ||||||
|  |   int lexico; | ||||||
|  |   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); | ||||||
|  |   LexicographicToWorldRank[lexico]=WorldRank; | ||||||
|  |   _processor = lexico; | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // global sum Lexico to World mapping | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  |    | ||||||
|  | }; | ||||||
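The constructor above rebuilds the physics-coordinate-to-world-rank map by composing node (Group) and on-node (Shm) coordinates and then indexing the result lexicographically. A self-contained worked example of the coordinate/index mapping, assuming the first dimension runs fastest (the exact convention lives in Grid's Lexicographic helpers):

    #include <cassert>
    #include <vector>

    static int IndexFromCoor(const std::vector<int> &coor, const std::vector<int> &dims) {
      int idx = 0, stride = 1;
      for (size_t d = 0; d < dims.size(); d++) { idx += coor[d] * stride; stride *= dims[d]; }
      return idx;
    }
    static void CoorFromIndex(std::vector<int> &coor, int idx, const std::vector<int> &dims) {
      coor.resize(dims.size());
      for (size_t d = 0; d < dims.size(); d++) { coor[d] = idx % dims[d]; idx /= dims[d]; }
    }

    int main() {
      std::vector<int> dims = {2, 2, 2, 4};      // e.g. a 2.2.2.4 processor grid
      std::vector<int> coor = {1, 0, 1, 3};
      int idx = IndexFromCoor(coor, dims);       // 1 + 0*2 + 1*4 + 3*8 = 29
      std::vector<int> back;
      CoorFromIndex(back, idx, dims);
      assert(back == coor);
      return 0;
    }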
|  |  | ||||||
|  | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
|  | 					   int dest, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int from, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> reqs(0); | ||||||
|  |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|  |   SendToRecvFromComplete(reqs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int sender, | ||||||
|  | 					   int receiver, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Status stat; | ||||||
|  |   assert(sender != receiver); | ||||||
|  |   int tag = sender; | ||||||
|  |   if ( _processor == sender ) { | ||||||
|  |     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); | ||||||
|  |   } | ||||||
|  |   if ( _processor == receiver ) {  | ||||||
|  |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						void *xmit, | ||||||
|  | 						int dest, | ||||||
|  | 						void *recv, | ||||||
|  | 						int from, | ||||||
|  | 						int bytes) | ||||||
|  | { | ||||||
|  | #if 0 | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |    | ||||||
|  |   static int sequence; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |   int tag; | ||||||
|  |   int check; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|  |   int gdest = GroupRanks[dest]; | ||||||
|  |   int gfrom = GroupRanks[from]; | ||||||
|  |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|  |   sequence++; | ||||||
|  |    | ||||||
|  |   char *from_ptr = (char *)ShmCommBufs[ShmRank]; | ||||||
|  |  | ||||||
|  |   int small = (bytes<MAX_MPI_SHM_BYTES); | ||||||
|  |  | ||||||
|  |   typedef uint64_t T; | ||||||
|  |   int words = bytes/sizeof(T); | ||||||
|  |  | ||||||
|  |   assert(((size_t)bytes &(sizeof(T)-1))==0); | ||||||
|  |   assert(gme == ShmRank); | ||||||
|  |  | ||||||
|  |   if ( small && (gdest !=MPI_UNDEFINED) ) { | ||||||
|  |  | ||||||
|  |     char *to_ptr   = (char *)ShmCommBufs[gdest]; | ||||||
|  |  | ||||||
|  |     assert(gme != gdest); | ||||||
|  |  | ||||||
|  |     T *ip = (T *)xmit; | ||||||
|  |     T *op = (T *)to_ptr; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     bcopy(&_processor,&to_ptr[bytes],sizeof(_processor)); | ||||||
|  |     bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence)); | ||||||
|  |   } else {  | ||||||
|  |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |    | ||||||
|  |   if (small && (gfrom !=MPI_UNDEFINED) ) { | ||||||
|  |     T *ip = (T *)from_ptr; | ||||||
|  |     T *op = (T *)recv; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |     bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag)); | ||||||
|  |     bcopy(&from_ptr[bytes+4],&check,sizeof(check)); | ||||||
|  |     assert(check==sequence); | ||||||
|  |     assert(tag==from); | ||||||
|  |   } else {  | ||||||
|  |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |   int rank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |    | ||||||
|  |   assert(ierr==0); | ||||||
|  |  | ||||||
|  |   list.push_back(xrq); | ||||||
|  |   list.push_back(rrq); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						       void *xmit, | ||||||
|  | 						       int dest, | ||||||
|  | 						       void *recv, | ||||||
|  | 						       int from, | ||||||
|  | 						       int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|  |   int gdest = GroupRanks[dest]; | ||||||
|  |   int gfrom = GroupRanks[from]; | ||||||
|  |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|  |   assert(gme == ShmRank); | ||||||
|  |  | ||||||
|  |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|  |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|  |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   SendToRecvFromComplete(list); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilBarrier(void) | ||||||
|  | { | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
|  |   MPI_Barrier  (ShmComm); | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   int nreq=list.size(); | ||||||
|  |   std::vector<MPI_Status> status(nreq); | ||||||
|  |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Barrier(void) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Barrier(communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Bcast(data, | ||||||
|  | 		     bytes, | ||||||
|  | 		     MPI_BYTE, | ||||||
|  | 		     root, | ||||||
|  | 		     communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr= MPI_Bcast(data, | ||||||
|  | 		      bytes, | ||||||
|  | 		      MPI_BYTE, | ||||||
|  | 		      root, | ||||||
|  | 		      communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -28,12 +28,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include "Grid.h" | #include "Grid.h" | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| void CartesianCommunicator::Init(int *argc, char *** arv) | void CartesianCommunicator::Init(int *argc, char *** arv) | ||||||
| { | { | ||||||
|  |   WorldRank = 0; | ||||||
|  |   WorldSize = 1; | ||||||
|  |   ShmRank=0; | ||||||
|  |   ShmSize=1; | ||||||
|  |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| int Rank(void ){ return 0; }; |  | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| { | { | ||||||
|   _processors = processors; |   _processors = processors; | ||||||
| @@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> & | |||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::Barrier(void) | void CartesianCommunicator::Barrier(void){} | ||||||
| { | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | ||||||
| } | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | ||||||
|  | int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | ||||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);} | ||||||
| { |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) |  | ||||||
| { |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
| { | { | ||||||
|   source =0; |   source =0; | ||||||
|   dest=0; |   dest=0; | ||||||
| } | } | ||||||
| int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) |  | ||||||
| { |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) |  | ||||||
| { |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -39,17 +39,22 @@ namespace Grid { | |||||||
|     BACKTRACEFILE();		   \ |     BACKTRACEFILE();		   \ | ||||||
|   }\ |   }\ | ||||||
| } | } | ||||||
| int Rank(void) { |  | ||||||
|   return shmem_my_pe(); |  | ||||||
| } | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| typedef struct HandShake_t {  | typedef struct HandShake_t {  | ||||||
|   uint64_t seq_local; |   uint64_t seq_local; | ||||||
|   uint64_t seq_remote; |   uint64_t seq_remote; | ||||||
| } HandShake; | } HandShake; | ||||||
|  |  | ||||||
|  |  | ||||||
| static Vector< HandShake > XConnections; | static Vector< HandShake > XConnections; | ||||||
| static Vector< HandShake > RConnections; | static Vector< HandShake > RConnections; | ||||||
|  |  | ||||||
|  |  | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|   shmem_init(); |   shmem_init(); | ||||||
|   XConnections.resize(shmem_n_pes()); |   XConnections.resize(shmem_n_pes()); | ||||||
| @@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     RConnections[pe].seq_local = 0; |     RConnections[pe].seq_local = 0; | ||||||
|     RConnections[pe].seq_remote= 0; |     RConnections[pe].seq_remote= 0; | ||||||
|   } |   } | ||||||
|  |   WorldSize = shmem_n_pes(); | ||||||
|  |   WorldRank = shmem_my_pe(); | ||||||
|  |   ShmRank=0; | ||||||
|  |   ShmSize=1; | ||||||
|  |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|   shmem_barrier_all(); |   shmem_barrier_all(); | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| { | { | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
| @@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | |||||||
|  |  | ||||||
|   if ( _processor == sender ) { |   if ( _processor == sender ) { | ||||||
|  |  | ||||||
|     printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); |  | ||||||
|     // Check he has posted a receive |     // Check he has posted a receive | ||||||
|     while(SendSeq->seq_remote == SendSeq->seq_local); |     while(SendSeq->seq_remote == SendSeq->seq_local); | ||||||
|  |  | ||||||
|     printf("Sender receive %d posted\n",sender,receiver); |  | ||||||
|  |  | ||||||
|     // Advance our send count |     // Advance our send count | ||||||
|     seq = ++(SendSeq->seq_local); |     seq = ++(SendSeq->seq_local); | ||||||
|      |      | ||||||
| @@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | |||||||
|     shmem_putmem(recv,xmit,bytes,receiver); |     shmem_putmem(recv,xmit,bytes,receiver); | ||||||
|     shmem_fence(); |     shmem_fence(); | ||||||
|  |  | ||||||
|     printf("Sender sent payload %d\n",seq); |  | ||||||
|     //Notify him we're done |     //Notify him we're done | ||||||
|     shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); |     shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); | ||||||
|     shmem_fence(); |     shmem_fence(); | ||||||
|     printf("Sender ringing door bell  %d\n",seq); |  | ||||||
|   } |   } | ||||||
|   if ( _processor == receiver ) { |   if ( _processor == receiver ) { | ||||||
|  |  | ||||||
|     printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); |  | ||||||
|     // Post a receive |     // Post a receive | ||||||
|     seq = ++(RecvSeq->seq_local); |     seq = ++(RecvSeq->seq_local); | ||||||
|     shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); |     shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); | ||||||
|  |  | ||||||
|     printf("Receiver Opening letter box %d\n",seq); |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     // Now wait until he has advanced our reception counter |     // Now wait until he has advanced our reception counter | ||||||
|     while(RecvSeq->seq_remote != RecvSeq->seq_local); |     while(RecvSeq->seq_remote != RecvSeq->seq_local); | ||||||
|  |  | ||||||
|     printf("Receiver Got the mail %d\n",seq); |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
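To recap the handshake the code above implements (a descriptive summary of the existing logic, not new behaviour):

  // Receiver: bumps RecvSeq->seq_local and shmem_putmem's it into the
  //           sender's SendSeq->seq_remote, i.e. posts the receive.
  // Sender:   spins until SendSeq->seq_remote differs from its seq_local,
  //           bumps its own count, puts the payload, fences, then puts the
  //           new count into the receiver's RecvSeq->seq_remote.
  // Receiver: spins until RecvSeq->seq_remote equals its seq_local.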
|  |  | ||||||
|   | |||||||
| @@ -45,7 +45,7 @@ public: | |||||||
| // Gather for when there is no need to SIMD split with compression | // Gather for when there is no need to SIMD split with compression | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| template<class vobj,class cobj,class compressor> void  | template<class vobj,class cobj,class compressor> void  | ||||||
| Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) | Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) | ||||||
| { | { | ||||||
|   int rd = rhs._grid->_rdimensions[dimension]; |   int rd = rhs._grid->_rdimensions[dimension]; | ||||||
|  |  | ||||||
| @@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
| 	int o      =   n*n1; | 	int o      =   n*n1; | ||||||
| 	int offset = b+n*n2; | 	int offset = b+n*n2; | ||||||
| 	cobj temp =compress(rhs._odata[so+o+b]); | 	cobj temp =compress(rhs._odata[so+o+b]); | ||||||
|  |  | ||||||
| 	extract<cobj>(temp,pointers,offset); | 	extract<cobj>(temp,pointers,offset); | ||||||
|  |  | ||||||
|       } |       } | ||||||
| @@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
|   } else {  |   } else {  | ||||||
|  |  | ||||||
|     assert(0); // FIXME: this path is believed to be buggy |     assert(0); // FIXME: this path is believed to be buggy | ||||||
|  |  | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o=n*rhs._grid->_slice_stride[dimension]; | 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||||
| @@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Gather for when there is no need to SIMD split | // Gather for when there is no need to SIMD split | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask) | template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||||
| { | { | ||||||
|   SimpleCompressor<vobj> dontcompress; |   SimpleCompressor<vobj> dontcompress; | ||||||
|   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); |   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); | ||||||
| @@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec | |||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Scatter for when there is no need to SIMD split | // Scatter for when there is no need to SIMD split | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask) | template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||||
| { | { | ||||||
|   int rd = rhs._grid->_rdimensions[dimension]; |   int rd = rhs._grid->_rdimensions[dimension]; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|    |    | ||||||
|   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; |   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; | ||||||
|   std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); |   commVector<vobj> send_buf(buffer_size); | ||||||
|   std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size); |   commVector<vobj> recv_buf(buffer_size); | ||||||
|  |  | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |   int cb= (cbmask==0x2)? Odd : Even; | ||||||
|   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); |   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||||
| @@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; |   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; | ||||||
|   int words = sizeof(vobj)/sizeof(vector_type); |   int words = sizeof(vobj)/sizeof(vector_type); | ||||||
|  |  | ||||||
|   std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); |   std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||||
|   std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); |   std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||||
|  |  | ||||||
|   int bytes = buffer_size*sizeof(scalar_object); |   int bytes = buffer_size*sizeof(scalar_object); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -65,9 +65,6 @@ public: | |||||||
|      |      | ||||||
| class LatticeExpressionBase {}; | class LatticeExpressionBase {}; | ||||||
|  |  | ||||||
| template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator?? |  | ||||||
| template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator?? |  | ||||||
|  |  | ||||||
| template <typename Op, typename T1>                            | template <typename Op, typename T1>                            | ||||||
| class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { | class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { | ||||||
|  public: |  public: | ||||||
|   | |||||||
| @@ -164,15 +164,17 @@ PARALLEL_FOR_LOOP | |||||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); |       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||||
|       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); |       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||||
|  |  | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|       int odx,idx; |       int odx,idx; | ||||||
|       idx= grid->iIndex(site); |       idx= grid->iIndex(site); | ||||||
|       odx= grid->oIndex(site); |       odx= grid->oIndex(site); | ||||||
|  |  | ||||||
|       std::vector<sobj> buf(Nsimd); |       scalar_type * vp = (scalar_type *)&l._odata[odx]; | ||||||
|  |       scalar_type * pt = (scalar_type *)&s; | ||||||
|        |        | ||||||
|       extract(l._odata[odx],buf); |       for(int w=0;w<words;w++){ | ||||||
|        |         pt[w] = vp[idx+w*Nsimd]; | ||||||
|       s = buf[idx]; |       } | ||||||
|        |        | ||||||
|       return; |       return; | ||||||
|     }; |     }; | ||||||
| @@ -190,18 +192,17 @@ PARALLEL_FOR_LOOP | |||||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); |       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||||
|       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); |       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||||
|  |  | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|       int odx,idx; |       int odx,idx; | ||||||
|       idx= grid->iIndex(site); |       idx= grid->iIndex(site); | ||||||
|       odx= grid->oIndex(site); |       odx= grid->oIndex(site); | ||||||
|  |  | ||||||
|       std::vector<sobj> buf(Nsimd); |       scalar_type * vp = (scalar_type *)&l._odata[odx]; | ||||||
|  |       scalar_type * pt = (scalar_type *)&s; | ||||||
|        |        | ||||||
|       // extract-modify-merge cycle is easiest way and this is not perf critical |       for(int w=0;w<words;w++){ | ||||||
|       extract(l._odata[odx],buf); |         vp[idx+w*Nsimd] = pt[w]; | ||||||
|        |       } | ||||||
|       buf[idx] = s; |  | ||||||
|  |  | ||||||
|       merge(l._odata[odx],buf); |  | ||||||
|  |  | ||||||
|       return; |       return; | ||||||
|     }; |     }; | ||||||
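The replacement above drops the extract/modify/merge cycle in favour of a direct strided copy. The key fact is the memory layout of a vectorised object: the scalar words of the Nsimd SIMD lanes are interleaved, so word w of lane idx sits at flat offset idx + w*Nsimd when the vobj is viewed as an array of scalar_type. A hedged illustration (names are expository only):

  // vobj viewed as scalar_type[]: word0(lane0) word0(lane1) ... word0(laneNsimd-1) word1(lane0) ...
  for(int w=0;w<words;w++) pt[w] = vp[idx + w*Nsimd];   // peek: gather one lane
  for(int w=0;w<words;w++) vp[idx + w*Nsimd] = pt[w];   // poke: scatter it back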
|   | |||||||
| @@ -31,14 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| #include <random> | #include <random> | ||||||
|  |  | ||||||
| // Have not enabled RNG_SPRNG_SHA256 by default yet. |  | ||||||
| // Uncomment the following line to see the effect of the new RNG. |  | ||||||
| // #define RNG_SPRNG_SHA256 |  | ||||||
|  |  | ||||||
| #ifdef RNG_SPRNG_SHA256 |  | ||||||
| #include "rng/sprng-sha256.h" |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -118,11 +110,7 @@ namespace Grid { | |||||||
|     int _seeded; |     int _seeded; | ||||||
|     // One generator per site. |     // One generator per site. | ||||||
|     // Uniform and Gaussian distributions from these generators. |     // Uniform and Gaussian distributions from these generators. | ||||||
| #ifdef RNG_SPRNG_SHA256 | #ifdef RNG_RANLUX | ||||||
|     typedef uint32_t      RngStateType; |  | ||||||
|     typedef SprngSha256 RngEngine; |  | ||||||
|     static const int RngStateCount = 22; |  | ||||||
| #elif defined RNG_RANLUX |  | ||||||
|     typedef uint64_t      RngStateType; |     typedef uint64_t      RngStateType; | ||||||
|     typedef std::ranlux48 RngEngine; |     typedef std::ranlux48 RngEngine; | ||||||
|     static const int RngStateCount = 15; |     static const int RngStateCount = 15; | ||||||
| @@ -285,34 +273,6 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef RNG_SPRNG_SHA256 |  | ||||||
|     template<class source> void Seed(source &src) |  | ||||||
|     { |  | ||||||
|       std::vector<int> gcoor; |  | ||||||
|  |  | ||||||
|       long gsites = _grid->_gsites; |  | ||||||
|  |  | ||||||
|       RngState rs; |  | ||||||
|       for (int i = 0; i < 8; ++i) { |  | ||||||
|         splitRngState(rs, rs, src()); |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       for(long gidx=0;gidx<gsites;gidx++){ |  | ||||||
|  |  | ||||||
|         int rank,o_idx,i_idx; |  | ||||||
|         _grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
|         _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
|  |  | ||||||
|         int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
|  |  | ||||||
|         if( rank == _grid->ThisRank() ){ |  | ||||||
|           splitRngState(_generators[l_idx].rs, rs, gidx); |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|       _seeded=1; |  | ||||||
|  |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     // This loop could be made faster to avoid the Amdahl bottleneck by |     // This loop could be made faster to avoid the Amdahl bottleneck by | ||||||
|     // i)  seed generators on each timeslice, for x=y=z=0; |     // i)  seed generators on each timeslice, for x=y=z=0; | ||||||
|     // ii) seed generators on each z for x=y=0 |     // ii) seed generators on each z for x=y=0 | ||||||
| @@ -337,8 +297,9 @@ namespace Grid { | |||||||
|  |  | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
|  |  | ||||||
| 	std::vector<int> site_seeds(4); | 	const int num_rand_seed=16; | ||||||
| 	for(int i=0;i<4;i++){ | 	std::vector<int> site_seeds(num_rand_seed); | ||||||
|  | 	for(int i=0;i<site_seeds.size();i++){ | ||||||
| 	  site_seeds[i]= ui(pseeder); | 	  site_seeds[i]= ui(pseeder); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| @@ -352,7 +313,6 @@ namespace Grid { | |||||||
|       } |       } | ||||||
|       _seeded=1; |       _seeded=1; | ||||||
|     }     |     }     | ||||||
| #endif |  | ||||||
|  |  | ||||||
|     //FIXME implement generic IO and create state save/restore |     //FIXME implement generic IO and create state save/restore | ||||||
|     //void SaveState(const std::string<char> &file); |     //void SaveState(const std::string<char> &file); | ||||||
|   | |||||||
| @@ -1,353 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_RNG_STATE_H |  | ||||||
| #define INCLUDE_RNG_STATE_H |  | ||||||
|  |  | ||||||
| #include "show.h" |  | ||||||
|  |  | ||||||
| #ifndef USE_OPENSSL |  | ||||||
| #include "sha256.h" |  | ||||||
| #else |  | ||||||
| #include <openssl/sha.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #include <stdint.h> |  | ||||||
| #include <endian.h> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cmath> |  | ||||||
| #include <cassert> |  | ||||||
| #include <string> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
| #include <vector> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| struct RngState; |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs); |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const std::string& seed); |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const long seed) |  | ||||||
| { |  | ||||||
|   reset(rs, show(seed)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex); |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const long sindex = 0) |  | ||||||
| { |  | ||||||
|   splitRngState(rs, rs0, show(sindex)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline uint64_t randGen(RngState& rs); |  | ||||||
|  |  | ||||||
| inline double uRandGen(RngState& rs, const double upper = 1.0, const double lower = 0.0); |  | ||||||
|  |  | ||||||
| inline double gRandGen(RngState& rs, const double sigma = 1.0, const double center = 0.0); |  | ||||||
|  |  | ||||||
| inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input); |  | ||||||
|  |  | ||||||
| struct RngState |  | ||||||
| { |  | ||||||
|   uint64_t numBytes; |  | ||||||
|   uint32_t hash[8]; |  | ||||||
|   unsigned long index; |  | ||||||
|   // |  | ||||||
|   uint64_t cache[3]; |  | ||||||
|   double gaussian; |  | ||||||
|   int cacheAvail; |  | ||||||
|   bool gaussianAvail; |  | ||||||
|   // |  | ||||||
|   inline void init() |  | ||||||
|   { |  | ||||||
|     reset(*this); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   RngState() |  | ||||||
|   { |  | ||||||
|     init(); |  | ||||||
|   } |  | ||||||
|   RngState(const std::string& seed) |  | ||||||
|   { |  | ||||||
|     reset(*this, seed); |  | ||||||
|   } |  | ||||||
|   RngState(const long seed) |  | ||||||
|   { |  | ||||||
|     reset(*this, seed); |  | ||||||
|   } |  | ||||||
|   RngState(const RngState& rs0, const std::string& sindex) |  | ||||||
|   { |  | ||||||
|     splitRngState(*this, rs0, sindex); |  | ||||||
|   } |  | ||||||
|   RngState(const RngState& rs0, const long sindex) |  | ||||||
|   { |  | ||||||
|     splitRngState(*this, rs0, sindex); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   RngState split(const std::string& sindex) |  | ||||||
|   { |  | ||||||
|     RngState rs(*this, sindex); |  | ||||||
|     return rs; |  | ||||||
|   } |  | ||||||
|   RngState split(const long sindex) |  | ||||||
|   { |  | ||||||
|     RngState rs(*this, sindex); |  | ||||||
|     return rs; |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| const size_t RNG_STATE_NUM_OF_INT32 = 2 + 8 + 2 + 3 * 2 + 2 + 1 + 1; |  | ||||||
|  |  | ||||||
| inline uint64_t patchTwoUint32(const uint32_t a, const uint32_t b) |  | ||||||
| { |  | ||||||
|   return (uint64_t)a << 32 | (uint64_t)b; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitTwoUint32(uint32_t& a, uint32_t& b, const uint64_t x) |  | ||||||
| { |  | ||||||
|   b = (uint32_t)x; |  | ||||||
|   a = (uint32_t)(x >> 32); |  | ||||||
|   assert(x == patchTwoUint32(a, b)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void exportRngState(uint32_t* v, const RngState& rs) |  | ||||||
| { |  | ||||||
|   assert(22 == RNG_STATE_NUM_OF_INT32); |  | ||||||
|   splitTwoUint32(v[0], v[1], rs.numBytes); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     v[2 + i] = rs.hash[i]; |  | ||||||
|   } |  | ||||||
|   splitTwoUint32(v[10], v[11], rs.index); |  | ||||||
|   for (int i = 0; i < 3; ++i) { |  | ||||||
|     splitTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1], rs.cache[i]); |  | ||||||
|   } |  | ||||||
|   union { |  | ||||||
|     double d; |  | ||||||
|     uint64_t l; |  | ||||||
|   } g; |  | ||||||
|   g.d = rs.gaussian; |  | ||||||
|   splitTwoUint32(v[18], v[19], g.l); |  | ||||||
|   v[20] = rs.cacheAvail; |  | ||||||
|   v[21] = rs.gaussianAvail; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void importRngState(RngState& rs, const uint32_t* v) |  | ||||||
| { |  | ||||||
|   assert(22 == RNG_STATE_NUM_OF_INT32); |  | ||||||
|   rs.numBytes = patchTwoUint32(v[0], v[1]); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     rs.hash[i] = v[2 + i]; |  | ||||||
|   } |  | ||||||
|   rs.index = patchTwoUint32(v[10], v[11]); |  | ||||||
|   for (int i = 0; i < 3; ++i) { |  | ||||||
|     rs.cache[i] = patchTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1]); |  | ||||||
|   } |  | ||||||
|   union { |  | ||||||
|     double d; |  | ||||||
|     uint64_t l; |  | ||||||
|   } g; |  | ||||||
|   g.l = patchTwoUint32(v[18], v[19]); |  | ||||||
|   rs.gaussian = g.d; |  | ||||||
|   rs.cacheAvail = v[20]; |  | ||||||
|   rs.gaussianAvail = v[21]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void exportRngState(std::vector<uint32_t>& v, const RngState& rs) |  | ||||||
| { |  | ||||||
|   v.resize(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   exportRngState(v.data(), rs); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void importRngState(RngState& rs, const std::vector<uint32_t>& v) |  | ||||||
| { |  | ||||||
|   assert(RNG_STATE_NUM_OF_INT32 == v.size()); |  | ||||||
|   importRngState(rs, v.data()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::ostream& operator<<(std::ostream& os, const RngState& rs) |  | ||||||
| { |  | ||||||
|   std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   exportRngState(v, rs); |  | ||||||
|   for (size_t i = 0; i < v.size() - 1; ++i) { |  | ||||||
|     os << v[i] << " "; |  | ||||||
|   } |  | ||||||
|   os << v.back(); |  | ||||||
|   return os; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::istream& operator>>(std::istream& is, RngState& rs) |  | ||||||
| { |  | ||||||
|   std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   for (size_t i = 0; i < v.size(); ++i) { |  | ||||||
|     is >> v[i]; |  | ||||||
|   } |  | ||||||
|   importRngState(rs, v); |  | ||||||
|   return is; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const RngState& rs) |  | ||||||
| { |  | ||||||
|   return shows(rs); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline bool operator==(const RngState& rs1, const RngState& rs2) |  | ||||||
| { |  | ||||||
|   return 0 == memcmp(&rs1, &rs2, sizeof(RngState)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs) |  | ||||||
| { |  | ||||||
|   std::memset(&rs, 0, sizeof(RngState)); |  | ||||||
|   rs.numBytes = 0; |  | ||||||
|   rs.hash[0] = 0; |  | ||||||
|   rs.hash[1] = 0; |  | ||||||
|   rs.hash[2] = 0; |  | ||||||
|   rs.hash[3] = 0; |  | ||||||
|   rs.hash[4] = 0; |  | ||||||
|   rs.hash[5] = 0; |  | ||||||
|   rs.hash[6] = 0; |  | ||||||
|   rs.hash[7] = 0; |  | ||||||
|   rs.index = 0; |  | ||||||
|   rs.cache[0] = 0; |  | ||||||
|   rs.cache[1] = 0; |  | ||||||
|   rs.cache[2] = 0; |  | ||||||
|   rs.gaussian = 0.0; |  | ||||||
|   rs.cacheAvail = 0; |  | ||||||
|   rs.gaussianAvail = false; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const std::string& seed) |  | ||||||
| { |  | ||||||
|   reset(rs); |  | ||||||
|   splitRngState(rs, rs, seed); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input) |  | ||||||
| { |  | ||||||
|   std::string data(32, ' '); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     data[i*4 + 0] = (rs.hash[i] >> 24) & 0xFF; |  | ||||||
|     data[i*4 + 1] = (rs.hash[i] >> 16) & 0xFF; |  | ||||||
|     data[i*4 + 2] = (rs.hash[i] >>  8) & 0xFF; |  | ||||||
|     data[i*4 + 3] =  rs.hash[i]        & 0xFF; |  | ||||||
|   } |  | ||||||
|   data += input; |  | ||||||
| #ifndef USE_OPENSSL |  | ||||||
|   sha256::computeHash(hash, (const uint8_t*)data.c_str(), data.length()); |  | ||||||
| #else |  | ||||||
|   { |  | ||||||
|     uint8_t rawHash[32]; |  | ||||||
|     SHA256((unsigned char*)data.c_str(), data.length(), rawHash); |  | ||||||
|     for (int i = 0; i < 8; ++i) { |  | ||||||
|       hash[i] = (((uint32_t)rawHash[i*4 + 0]) << 24) |  | ||||||
|               + (((uint32_t)rawHash[i*4 + 1]) << 16) |  | ||||||
|               + (((uint32_t)rawHash[i*4 + 2]) <<  8) |  | ||||||
|               + ( (uint32_t)rawHash[i*4 + 3]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex) |  | ||||||
|   // produce a new rng ``rs'' uniquely identified by ``rs0'' and ``sindex'' |  | ||||||
|   // will not affect old rng ``rs0'' |  | ||||||
|   // the function should behave correctly even if ``rs'' is actually ``rs0'' |  | ||||||
| { |  | ||||||
|   std::string input = ssprintf("[%lu] {%s}", rs0.index, sindex.c_str()); |  | ||||||
|   rs.numBytes = rs0.numBytes + 64 * ((32 + input.length() + 1 + 8 - 1) / 64 + 1); |  | ||||||
|   computeHashWithInput(rs.hash, rs0, input); |  | ||||||
|   rs.index = 0; |  | ||||||
|   rs.cache[0] = 0; |  | ||||||
|   rs.cache[1] = 0; |  | ||||||
|   rs.cache[2] = 0; |  | ||||||
|   rs.gaussian = 0.0; |  | ||||||
|   rs.cacheAvail = 0; |  | ||||||
|   rs.gaussianAvail = false; |  | ||||||
| } |  | ||||||
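For context, the splitting interface removed by this patch composes hierarchically. A brief usage sketch, with an illustrative seed string and index:

  RngState root("ensemble-seed");        // reset from a string seed
  long gidx = 12345;                     // e.g. a global site index
  RngState site = root.split(gidx);      // child stream unique to (root, gidx)
  uint64_t bits = randGen(site);         // 64 pseudo-random bits
  double   u    = uRandGen(site);        // uniform in [0,1)
  double   g    = gRandGen(site);        // unit Gaussian via the polar method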
|  |  | ||||||
| inline uint64_t randGen(RngState& rs) |  | ||||||
| { |  | ||||||
|   assert(0 <= rs.cacheAvail && rs.cacheAvail <= 3); |  | ||||||
|   rs.index += 1; |  | ||||||
|   if (rs.cacheAvail > 0) { |  | ||||||
|     rs.cacheAvail -= 1; |  | ||||||
|     uint64_t r = rs.cache[rs.cacheAvail]; |  | ||||||
|     rs.cache[rs.cacheAvail] = 0; |  | ||||||
|     return r; |  | ||||||
|   } else { |  | ||||||
|     uint32_t hash[8]; |  | ||||||
|     computeHashWithInput(hash, rs, ssprintf("[%lu]", rs.index)); |  | ||||||
|     rs.cache[0] = patchTwoUint32(hash[0], hash[1]); |  | ||||||
|     rs.cache[1] = patchTwoUint32(hash[2], hash[3]); |  | ||||||
|     rs.cache[2] = patchTwoUint32(hash[4], hash[5]); |  | ||||||
|     rs.cacheAvail = 3; |  | ||||||
|     return patchTwoUint32(hash[6], hash[7]); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
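Note that each SHA-256 evaluation yields 256 bits, i.e. four 64-bit words: randGen returns one immediately and caches the other three, so the hash is recomputed only on every fourth draw.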
|  |  | ||||||
| inline double uRandGen(RngState& rs, const double upper, const double lower) |  | ||||||
| { |  | ||||||
|   uint64_t u = randGen(rs); |  | ||||||
|   const double fac = 1.0 / (256.0 * 256.0 * 256.0 * 256.0) / (256.0 * 256.0 * 256.0 * 256.0); |  | ||||||
|   return u * fac * (upper - lower) + lower; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline double gRandGen(RngState& rs, const double sigma, const double center) |  | ||||||
| { |  | ||||||
|   rs.index += 1; |  | ||||||
|   if (rs.gaussianAvail) { |  | ||||||
|     rs.gaussianAvail = false; |  | ||||||
|     return rs.gaussian * sigma + center; |  | ||||||
|   } else { |  | ||||||
|     // pick 2 uniform numbers in the square extending from |  | ||||||
|     // -1 to 1 in each direction, see if they are in the |  | ||||||
|     // unit circle, and try again if they are not. |  | ||||||
|     int num_try = 1; |  | ||||||
|     double v1, v2, rsq; |  | ||||||
|     do { |  | ||||||
|       v1 = uRandGen(rs, 1.0, -1.0); |  | ||||||
|       v2 = uRandGen(rs, 1.0, -1.0); |  | ||||||
|       if ((num_try % 1000)==0) { |  | ||||||
|         printf("gRandGen : WARNING num_try=%d v1=%e v2=%e\n",num_try,v1,v2); |  | ||||||
|       } |  | ||||||
|       rsq = v1*v1 + v2*v2; |  | ||||||
|       num_try++; |  | ||||||
|     } while ((num_try < 10000) && (rsq >= 1.0 || rsq == 0)); |  | ||||||
|     if (num_try > 9999) { |  | ||||||
|       printf("gRandGen : WARNING failed after 10000 tries (corrupted RNG?), returning ridiculous numbers (1e+10)\n"); |  | ||||||
|       return 1e+10; |  | ||||||
|     } |  | ||||||
|     double fac = std::sqrt(-2.0 * std::log(rsq)/rsq); |  | ||||||
|     rs.gaussian = v1 * fac; |  | ||||||
|     rs.gaussianAvail = true; |  | ||||||
|     return v2 * fac * sigma + center; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
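The rejection loop above is the Marsaglia polar form of the Box-Muller transform: for a uniform point (v1, v2) in the unit disc with rsq = v1*v1 + v2*v2 in (0,1), two independent unit Gaussians are

  g1 = v1 * sqrt(-2*log(rsq)/rsq),   g2 = v2 * sqrt(-2*log(rsq)/rsq);

the code returns g2 (scaled by sigma and shifted by center) and caches g1 for the next call.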
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
| @@ -1,348 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| // Code within namespace sha256 are originally from Stephan Brumme. |  | ||||||
| // see http://create.stephan-brumme.com/disclaimer.html |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #include <stdint.h> |  | ||||||
| #include <endian.h> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cmath> |  | ||||||
| #include <cassert> |  | ||||||
| #include <string> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
| #include <vector> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace sha256 { |  | ||||||
|  |  | ||||||
|   const size_t BlockSize = 512 / 8; |  | ||||||
|  |  | ||||||
|   const size_t HashBytes = 32; |  | ||||||
|  |  | ||||||
|   const size_t HashValues = HashBytes / 4; |  | ||||||
|  |  | ||||||
|   inline uint32_t rotate(uint32_t a, uint32_t c) |  | ||||||
|   { |  | ||||||
|     return (a >> c) | (a << (32 - c)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t swap(uint32_t x) |  | ||||||
|   { |  | ||||||
|     return (x >> 24) | |  | ||||||
|       ((x >>  8) & 0x0000FF00) | |  | ||||||
|       ((x <<  8) & 0x00FF0000) | |  | ||||||
|       (x << 24); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t f1(uint32_t e, uint32_t f, uint32_t g) |  | ||||||
|   // mix functions for processBlock() |  | ||||||
|   { |  | ||||||
|     uint32_t term1 = rotate(e, 6) ^ rotate(e, 11) ^ rotate(e, 25); |  | ||||||
|     uint32_t term2 = (e & f) ^ (~e & g); //(g ^ (e & (f ^ g))) |  | ||||||
|     return term1 + term2; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t f2(uint32_t a, uint32_t b, uint32_t c) |  | ||||||
|   // mix functions for processBlock() |  | ||||||
|   { |  | ||||||
|     uint32_t term1 = rotate(a, 2) ^ rotate(a, 13) ^ rotate(a, 22); |  | ||||||
|     uint32_t term2 = ((a | b) & c) | (a & b); //(a & (b ^ c)) ^ (b & c); |  | ||||||
|     return term1 + term2; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void processBlock(uint32_t newHash[8], const uint32_t oldHash[8], const uint8_t data[64]) |  | ||||||
|     // process 64 bytes of data |  | ||||||
|     // newHash and oldHash may be the same buffer |  | ||||||
|   { |  | ||||||
|     // get last hash |  | ||||||
|     uint32_t a = oldHash[0]; |  | ||||||
|     uint32_t b = oldHash[1]; |  | ||||||
|     uint32_t c = oldHash[2]; |  | ||||||
|     uint32_t d = oldHash[3]; |  | ||||||
|     uint32_t e = oldHash[4]; |  | ||||||
|     uint32_t f = oldHash[5]; |  | ||||||
|     uint32_t g = oldHash[6]; |  | ||||||
|     uint32_t h = oldHash[7]; |  | ||||||
|     // data represented as 16x 32-bit words |  | ||||||
|     const uint32_t* input = (uint32_t*) data; |  | ||||||
|     // convert to big endian |  | ||||||
|     uint32_t words[64]; |  | ||||||
|     int i; |  | ||||||
|     for (i = 0; i < 16; i++) { |  | ||||||
| #if defined(__BYTE_ORDER) && (__BYTE_ORDER != 0) && (__BYTE_ORDER == __BIG_ENDIAN) |  | ||||||
|       words[i] =      input[i]; |  | ||||||
| #else |  | ||||||
|       words[i] = swap(input[i]); |  | ||||||
| #endif |  | ||||||
|     } |  | ||||||
|     uint32_t x,y; // temporaries |  | ||||||
|     // first round |  | ||||||
|     x = h + f1(e,f,g) + 0x428a2f98 + words[ 0]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x71374491 + words[ 1]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xb5c0fbcf + words[ 2]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xe9b5dba5 + words[ 3]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x3956c25b + words[ 4]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x59f111f1 + words[ 5]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x923f82a4 + words[ 6]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xab1c5ed5 + words[ 7]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // second round |  | ||||||
|     x = h + f1(e,f,g) + 0xd807aa98 + words[ 8]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x12835b01 + words[ 9]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x243185be + words[10]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x550c7dc3 + words[11]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x72be5d74 + words[12]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x80deb1fe + words[13]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x9bdc06a7 + words[14]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xc19bf174 + words[15]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 24 words |  | ||||||
|     for (; i < 24; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // third round |  | ||||||
|     x = h + f1(e,f,g) + 0xe49b69c1 + words[16]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xefbe4786 + words[17]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x0fc19dc6 + words[18]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x240ca1cc + words[19]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x2de92c6f + words[20]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x4a7484aa + words[21]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x5cb0a9dc + words[22]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x76f988da + words[23]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 32 words |  | ||||||
|     for (; i < 32; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // fourth round |  | ||||||
|     x = h + f1(e,f,g) + 0x983e5152 + words[24]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xa831c66d + words[25]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xb00327c8 + words[26]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xbf597fc7 + words[27]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0xc6e00bf3 + words[28]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xd5a79147 + words[29]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x06ca6351 + words[30]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x14292967 + words[31]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 40 words |  | ||||||
|     for (; i < 40; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // fifth round |  | ||||||
|     x = h + f1(e,f,g) + 0x27b70a85 + words[32]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x2e1b2138 + words[33]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x4d2c6dfc + words[34]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x53380d13 + words[35]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x650a7354 + words[36]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x766a0abb + words[37]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x81c2c92e + words[38]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x92722c85 + words[39]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 48 words |  | ||||||
|     for (; i < 48; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // sixth round |  | ||||||
|     x = h + f1(e,f,g) + 0xa2bfe8a1 + words[40]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xa81a664b + words[41]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xc24b8b70 + words[42]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xc76c51a3 + words[43]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0xd192e819 + words[44]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xd6990624 + words[45]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0xf40e3585 + words[46]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x106aa070 + words[47]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 56 words |  | ||||||
|     for (; i < 56; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // seventh round |  | ||||||
|     x = h + f1(e,f,g) + 0x19a4c116 + words[48]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x1e376c08 + words[49]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x2748774c + words[50]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x34b0bcb5 + words[51]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x391c0cb3 + words[52]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x4ed8aa4a + words[53]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x5b9cca4f + words[54]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x682e6ff3 + words[55]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 64 words |  | ||||||
|     for (; i < 64; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // eighth round |  | ||||||
|     x = h + f1(e,f,g) + 0x748f82ee + words[56]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x78a5636f + words[57]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x84c87814 + words[58]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x8cc70208 + words[59]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x90befffa + words[60]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xa4506ceb + words[61]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0xbef9a3f7 + words[62]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xc67178f2 + words[63]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // update hash |  | ||||||
|     newHash[0] = a + oldHash[0]; |  | ||||||
|     newHash[1] = b + oldHash[1]; |  | ||||||
|     newHash[2] = c + oldHash[2]; |  | ||||||
|     newHash[3] = d + oldHash[3]; |  | ||||||
|     newHash[4] = e + oldHash[4]; |  | ||||||
|     newHash[5] = f + oldHash[5]; |  | ||||||
|     newHash[6] = g + oldHash[6]; |  | ||||||
|     newHash[7] = h + oldHash[7]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void processInput( |  | ||||||
|       uint32_t hash[8], |  | ||||||
|       const uint32_t oldHash[8], const uint64_t numBytes, |  | ||||||
|       const uint8_t* input, const size_t inputSize) |  | ||||||
|     // process final block, less than 64 bytes |  | ||||||
|     // newHash and oldHash may be the same buffer |  | ||||||
|   { |  | ||||||
|     // the input bytes are considered as bit strings, where the first bit is the most significant bit of the byte |  | ||||||
|     // - append "1" bit to message |  | ||||||
|     // - append "0" bits until message length in bit mod 512 is 448 |  | ||||||
|     // - append length as 64 bit integer |  | ||||||
|     // process initial parts of input |  | ||||||
|     std::memmove(hash, oldHash, 32); |  | ||||||
|     const int nBlocks = inputSize / 64; |  | ||||||
|     for (int i = 0; i < nBlocks; ++i) { |  | ||||||
|       processBlock(hash, hash, input + i * 64); |  | ||||||
|     } |  | ||||||
|     // initialize buffer from input |  | ||||||
|     const size_t bufferSize = inputSize - nBlocks * 64; |  | ||||||
|     unsigned char buffer[BlockSize]; |  | ||||||
|     std::memcpy(buffer, input + nBlocks * 64, bufferSize); |  | ||||||
|     // number of bits |  | ||||||
|     size_t paddedLength = bufferSize * 8; |  | ||||||
|     // plus one bit set to 1 (always appended) |  | ||||||
|     paddedLength++; |  | ||||||
|     // number of bits must be (numBits % 512) = 448 |  | ||||||
|     size_t lower11Bits = paddedLength & 511; |  | ||||||
|     if (lower11Bits <= 448) { |  | ||||||
|       paddedLength +=       448 - lower11Bits; |  | ||||||
|     } else { |  | ||||||
|       paddedLength += 512 + 448 - lower11Bits; |  | ||||||
|     } |  | ||||||
|     // convert from bits to bytes |  | ||||||
|     paddedLength /= 8; |  | ||||||
|     // only needed if additional data flows over into a second block |  | ||||||
|     unsigned char extra[BlockSize]; |  | ||||||
|     // append a "1" bit, 128 => binary 10000000 |  | ||||||
|     if (bufferSize < BlockSize) { |  | ||||||
|       buffer[bufferSize] = 128; |  | ||||||
|     } else { |  | ||||||
|       extra[0] = 128; |  | ||||||
|     } |  | ||||||
|     size_t i; |  | ||||||
|     for (i = bufferSize + 1; i < BlockSize; i++) { |  | ||||||
|       buffer[i] = 0; |  | ||||||
|     } |  | ||||||
|     for (; i < paddedLength; i++) { |  | ||||||
|       extra[i - BlockSize] = 0; |  | ||||||
|     } |  | ||||||
|     // add message length in bits as 64 bit number |  | ||||||
|     uint64_t msgBits = 8 * (numBytes + inputSize); |  | ||||||
|     // find right position |  | ||||||
|     unsigned char* addLength; |  | ||||||
|     if (paddedLength < BlockSize) { |  | ||||||
|       addLength = buffer + paddedLength; |  | ||||||
|     } else { |  | ||||||
|       addLength = extra + paddedLength - BlockSize; |  | ||||||
|     } |  | ||||||
|     // must be big endian |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 56) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 48) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 40) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 32) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 24) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 16) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >>  8) & 0xFF); |  | ||||||
|     *addLength   = (unsigned char)( msgBits        & 0xFF); |  | ||||||
|     // process blocks |  | ||||||
|     processBlock(hash, hash, buffer); |  | ||||||
|     // flowed over into a second block ? |  | ||||||
|     if (paddedLength > BlockSize) { |  | ||||||
|       processBlock(hash, hash, extra); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
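As a worked instance of the padding rule above: a 3-byte final fragment is 24 bits; the mandatory 1-bit makes 25, zero-padding to 448 bits gives paddedLength = 56 bytes, and the 64-bit length field fills the block out to exactly 512 bits, so everything fits in buffer and the extra block is never touched. Any final fragment of 56 bytes or more pushes the length field into the second block.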
|  |  | ||||||
|   inline void setInitialHash(uint32_t hash[8]) |  | ||||||
|   { |  | ||||||
|     hash[0] = 0x6a09e667; |  | ||||||
|     hash[1] = 0xbb67ae85; |  | ||||||
|     hash[2] = 0x3c6ef372; |  | ||||||
|     hash[3] = 0xa54ff53a; |  | ||||||
|     hash[4] = 0x510e527f; |  | ||||||
|     hash[5] = 0x9b05688c; |  | ||||||
|     hash[6] = 0x1f83d9ab; |  | ||||||
|     hash[7] = 0x5be0cd19; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void computeHash(uint32_t hash[8], const void* data, const size_t size) |  | ||||||
|   { |  | ||||||
|     uint32_t initHash[8]; |  | ||||||
|     setInitialHash(initHash); |  | ||||||
|     processInput(hash, initHash, 0, (const uint8_t*)data, size); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void rawHashFromHash(uint8_t rawHash[HashBytes], const uint32_t hash[HashValues]) |  | ||||||
|   { |  | ||||||
|     uint8_t* current = rawHash; |  | ||||||
|     for (size_t i = 0; i < HashValues; i++) { |  | ||||||
|       *current++ = (hash[i] >> 24) & 0xFF; |  | ||||||
|       *current++ = (hash[i] >> 16) & 0xFF; |  | ||||||
|       *current++ = (hash[i] >>  8) & 0xFF; |  | ||||||
|       *current++ =  hash[i]        & 0xFF; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline std::string showRawHash(const uint8_t rawHash[HashBytes]) |  | ||||||
|   { |  | ||||||
|     std::string result; |  | ||||||
|     result.reserve(2 * HashBytes); |  | ||||||
|     for (size_t i = 0; i < HashBytes; i++) { |  | ||||||
|       static const char dec2hex[16+1] = "0123456789abcdef"; |  | ||||||
|       result += dec2hex[(rawHash[i] >> 4) & 15]; |  | ||||||
|       result += dec2hex[ rawHash[i]       & 15]; |  | ||||||
|     } |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline std::string showHash(const uint32_t hash[8]) |  | ||||||
|   { |  | ||||||
|     unsigned char rawHash[HashBytes]; |  | ||||||
|     rawHashFromHash(rawHash, hash); |  | ||||||
|     return showRawHash(rawHash); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| @@ -1,125 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2014 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_SHOW_H |  | ||||||
| #define INCLUDE_SHOW_H |  | ||||||
|  |  | ||||||
| #include <sstream> |  | ||||||
| #include <string> |  | ||||||
| #include <cstdarg> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cstdlib> |  | ||||||
| #include <cstdio> |  | ||||||
| #include <sstream> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| inline std::string vssprintf(const char* fmt, va_list args) |  | 
| { |  | 
|   char* cstr = NULL; |  | 
|   if (vasprintf(&cstr, fmt, args) < 0) { |  | 
|     return std::string();  // allocation/format failure: return empty rather than read an unset pointer |  | 
|   } |  | 
|   std::string str(cstr); |  | 
|   std::free(cstr); |  | 
|   return str; |  | 
| } |  | 
|  |  | 
| inline std::string ssprintf(const char* fmt, ...) |  | 
| { |  | 
|   va_list args; |  | 
|   va_start(args, fmt); |  | 
|   std::string str = vssprintf(fmt, args); |  | 
|   va_end(args);  // balance va_start before returning |  | 
|   return str; |  | 
| } |  | 
|  |  | ||||||
| inline std::string show() |  | ||||||
| { |  | ||||||
|   return ""; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const int& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%d", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const unsigned int& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%u", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const long& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%ld", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const unsigned long& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%lu", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const double& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%24.17E", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const bool& x) |  | ||||||
| { |  | ||||||
|   return x ? "true" : "false"; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const std::string& x) |  | ||||||
| { |  | ||||||
|   std::ostringstream out; |  | ||||||
|   out << x; |  | ||||||
|   return out.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class T> |  | ||||||
| std::string shows(const T& x) |  | ||||||
| { |  | ||||||
|   std::ostringstream out; |  | ||||||
|   out << x; |  | ||||||
|   return out.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class T> |  | ||||||
| T& reads(T& x, const std::string& str) |  | ||||||
| { |  | ||||||
|   std::istringstream in(str); |  | ||||||
|   in >> x; |  | ||||||
|   return x; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void fdisplay(FILE* fp, const std::string& str) |  | ||||||
| { |  | ||||||
|   fprintf(fp, "%s", str.c_str()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void fdisplayln(FILE* fp, const std::string& str) |  | ||||||
| { |  | ||||||
|   fprintf(fp, "%s\n", str.c_str()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
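The formatting helpers above form a tiny print/parse layer. A hedged round-trip sketch, compiled against the header shown above; the main driver is hypothetical and not part of the original file:

  #include <cstdio>

  // Hypothetical driver; show/reads/shows/fdisplayln are the helpers defined above.
  int main()
  {
    double x = 0.0;
    reads(x, show(3.5));            // show() prints with "%24.17E", reads() parses it back
    fdisplayln(stdout, show(x));    // ->  3.50000000000000000E+00
    fdisplayln(stdout, shows(42));  // shows() covers any type with operator<<
    return 0;
  }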
| @@ -1,115 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_SPRNG_SHA256_H |  | ||||||
| #define INCLUDE_SPRNG_SHA256_H |  | ||||||
|  |  | ||||||
| #include "rng-state.h" |  | ||||||
|  |  | ||||||
| #include <array> |  | ||||||
| #include <cstring> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| struct SprngSha256 |  | ||||||
| { |  | ||||||
|   RngState rs; |  | ||||||
|   // |  | ||||||
|   using result_type = uint64_t; |  | ||||||
|   // |  | ||||||
|   static constexpr result_type default_seed = 0; |  | ||||||
|   // |  | ||||||
|   explicit SprngSha256(result_type val = default_seed) |  | ||||||
|   { |  | ||||||
|     seed(val); |  | ||||||
|   } |  | ||||||
|   template<typename Sseq, typename = typename |  | ||||||
|     std::enable_if<!std::is_same<Sseq, SprngSha256>::value> |  | ||||||
|     ::type> |  | ||||||
|   explicit SprngSha256(Sseq& q) |  | ||||||
|   { |  | ||||||
|     seed(q); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   static constexpr result_type min() |  | ||||||
|   { |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   static constexpr result_type max() |  | ||||||
|   { |  | ||||||
|     return UINT64_MAX; |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   void seed(result_type val = default_seed) |  | ||||||
|   { |  | ||||||
|     reset(rs, (long)val); |  | ||||||
|   } |  | ||||||
|   template <class Sseq> |  | ||||||
|   typename std::enable_if<std::is_class<Sseq>::value>::type |  | ||||||
|   seed(Sseq& q) |  | ||||||
|   { |  | ||||||
|     std::array<uint32_t, 8> seq; |  | ||||||
|     q.generate(seq.begin(), seq.end()); |  | ||||||
|     reset(rs); |  | ||||||
|     for (size_t i = 0; i < seq.size(); ++i) { |  | ||||||
|       splitRngState(rs, rs, seq[i]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   result_type operator()() |  | ||||||
|   { |  | ||||||
|     return randGen(rs); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   void discard(unsigned long long z) |  | ||||||
|   { |  | ||||||
|     for (unsigned long long i = 0; i < z; ++i) { |  | ||||||
|       randGen(rs); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| inline std::ostream& operator<<(std::ostream& os, const SprngSha256& ss) |  | ||||||
| { |  | ||||||
|   os << ss.rs; |  | ||||||
|   return os; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::istream& operator>>(std::istream& is, SprngSha256& ss) |  | ||||||
| { |  | ||||||
|   is >> ss.rs; |  | ||||||
|   return is; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline bool operator==(const SprngSha256& ss1, const SprngSha256& ss2) |  | ||||||
| { |  | ||||||
|   return ss1.rs == ss2.rs; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
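SprngSha256 above wraps RngState behind the standard UniformRandomBitGenerator interface (result_type, min(), max(), operator(), seed(), discard()), so it can drive the <random> distributions directly. A hedged sketch, assuming rng-state.h from the same source tree is available; the main driver is illustrative only:

  #include <iostream>
  #include <random>

  // Hypothetical driver; SprngSha256 comes from the header shown above.
  int main()
  {
    SprngSha256 rng(12345);                                // seeds through reset(rs, val)
    std::uniform_real_distribution<double> uni(0.0, 1.0);
    std::cout << uni(rng) << std::endl;                    // any <random> distribution accepts it
    rng.discard(100);                                      // advance by drawing and dropping values
    return 0;
  }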
| @@ -33,7 +33,6 @@ directory | |||||||
| #define GRID_QCD_FERMION_OPERATOR_IMPL_H | #define GRID_QCD_FERMION_OPERATOR_IMPL_H | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -108,13 +107,14 @@ namespace Grid { | |||||||
|   INHERIT_GIMPL_TYPES(Base)	 \ |   INHERIT_GIMPL_TYPES(Base)	 \ | ||||||
|   INHERIT_FIMPL_TYPES(Base) |   INHERIT_FIMPL_TYPES(Base) | ||||||
|    |    | ||||||
|     /////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
|   // Single flavour four spinors with colour index |   // Single flavour four spinors with colour index | ||||||
|     /////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
|   template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD > |   template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD > | ||||||
|     class WilsonImpl |   class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { | ||||||
|       : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { |  | ||||||
|     public: |     public: | ||||||
|  |  | ||||||
|     static const int Dimension = Representation::Dimension; |     static const int Dimension = Representation::Dimension; | ||||||
|     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; |     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; | ||||||
|        |        | ||||||
| @@ -124,7 +124,6 @@ namespace Grid { | |||||||
|     const bool LsVectorised=false; |     const bool LsVectorised=false; | ||||||
|     typedef _Coeff_t Coeff_t; |     typedef _Coeff_t Coeff_t; | ||||||
|  |  | ||||||
|  |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |     INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|        |        | ||||||
|     template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >; |     template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >; | ||||||
| @@ -158,8 +157,7 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|        |        | ||||||
|     template <class ref> |     template <class ref> | ||||||
|       inline void loadLinkElement(Simd ®, |     inline void loadLinkElement(Simd ®, ref &memory) { | ||||||
| 				  ref &memory) { |  | ||||||
|       reg = memory; |       reg = memory; | ||||||
|     } |     } | ||||||
|        |        | ||||||
| @@ -202,9 +200,10 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|     /////// |   //////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Single flavour four spinors with colour index, 5d redblack |   // Single flavour four spinors with colour index, 5d redblack | ||||||
|     /////// |   //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD> | template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD> | ||||||
| class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {  | class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {  | ||||||
|   public: |   public: | ||||||
| @@ -227,12 +226,9 @@ namespace Grid { | |||||||
|   typedef Lattice<SiteSpinor> FermionField; |   typedef Lattice<SiteSpinor> FermionField; | ||||||
|    |    | ||||||
|   // Make the doubled gauge field a *scalar* |   // Make the doubled gauge field a *scalar* | ||||||
|       typedef iImplDoubledGaugeField<typename Simd::scalar_type> |   typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar | ||||||
|       SiteDoubledGaugeField;  // This is a scalar |   typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar | ||||||
|       typedef iImplGaugeField<typename Simd::scalar_type> |   typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar | ||||||
|       SiteScalarGaugeField;  // scalar |  | ||||||
|       typedef iImplGaugeLink<typename Simd::scalar_type> |  | ||||||
|       SiteScalarGaugeLink;  // scalar |  | ||||||
|        |        | ||||||
|   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; |   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; | ||||||
|        |        | ||||||
| @@ -250,6 +246,7 @@ namespace Grid { | |||||||
|   inline void loadLinkElement(Simd ®, ref &memory) { |   inline void loadLinkElement(Simd ®, ref &memory) { | ||||||
|     vsplat(reg, memory); |     vsplat(reg, memory); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |   inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, | ||||||
| 		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | 		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | ||||||
| 		       StencilImpl &St) { | 		       StencilImpl &St) { | ||||||
| @@ -262,8 +259,8 @@ namespace Grid { | |||||||
|     mult(&phi(), &UU(), &chi()); |     mult(&phi(), &UU(), &chi()); | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, |   inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)  | ||||||
| 			      const GaugeField &Umu) { |   { | ||||||
|     SiteScalarGaugeField ScalarUmu; |     SiteScalarGaugeField ScalarUmu; | ||||||
|     SiteDoubledGaugeField ScalarUds; |     SiteDoubledGaugeField ScalarUds; | ||||||
|      |      | ||||||
| @@ -289,13 +286,13 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, |   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)  | ||||||
| 				FermionField &A, int mu) { |   { | ||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, |   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu)  | ||||||
| 				FermionField Ã, int mu) { |   { | ||||||
| 	assert(0); | 	assert(0); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
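The scalar SiteDoubledGaugeField and the vsplat in loadLinkElement above reflect the Ls-vectorised layout: the SIMD lanes run over the fifth dimension, and the gauge links do not depend on s, so a single scalar link element is broadcast to every lane rather than gathered per lane. A minimal sketch of that broadcast; the helper name broadcastLinkElement is introduced here and vsplat is assumed in scope as in the code above:

  // Illustrative only: the broadcast that loadLinkElement above performs.
  template <class Vec, class Scal>
  inline void broadcastLinkElement(Vec &reg, const Scal &elem)
  {
    vsplat(reg, elem);  // every SIMD lane (every s-slice) receives the same link element
  }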
| @@ -305,9 +302,9 @@ namespace Grid { | |||||||
|     //////////////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|      |      | ||||||
| template <class S, int Nrepresentation,class _Coeff_t = RealD> | template <class S, int Nrepresentation,class _Coeff_t = RealD> | ||||||
|     class GparityWilsonImpl | class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { | ||||||
|       : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { |  | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|  static const int Dimension = Nrepresentation; |  static const int Dimension = Nrepresentation; | ||||||
|  |  | ||||||
|  const bool LsVectorised=false; |  const bool LsVectorised=false; | ||||||
| @@ -317,15 +314,9 @@ namespace Grid { | |||||||
|   |   | ||||||
|  INHERIT_GIMPL_TYPES(Gimpl); |  INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|        |        | ||||||
|       template <typename vtype> |  template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>; | ||||||
|       using iImplSpinor = |  template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>; | ||||||
|       iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>; |  template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>; | ||||||
|       template <typename vtype> |  | ||||||
|       using iImplHalfSpinor = |  | ||||||
| 	iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>; |  | ||||||
|       template <typename vtype> |  | ||||||
|       using iImplDoubledGaugeField = |  | ||||||
| 	iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>; |  | ||||||
|        |        | ||||||
|  typedef iImplSpinor<Simd> SiteSpinor; |  typedef iImplSpinor<Simd> SiteSpinor; | ||||||
|  typedef iImplHalfSpinor<Simd> SiteHalfSpinor; |  typedef iImplHalfSpinor<Simd> SiteHalfSpinor; | ||||||
| @@ -341,7 +332,6 @@ namespace Grid { | |||||||
|        |        | ||||||
|  ImplParams Params; |  ImplParams Params; | ||||||
|  |  | ||||||
|  |  | ||||||
|  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; |  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; | ||||||
|  |  | ||||||
|  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; |  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; | ||||||
| @@ -351,6 +341,7 @@ namespace Grid { | |||||||
|  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, | ||||||
| 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | ||||||
| 		      StencilImpl &St) { | 		      StencilImpl &St) { | ||||||
|  |  | ||||||
|   typedef SiteHalfSpinor vobj; |   typedef SiteHalfSpinor vobj; | ||||||
|    typedef typename SiteHalfSpinor::scalar_object sobj; |    typedef typename SiteHalfSpinor::scalar_object sobj; | ||||||
| 	 | 	 | ||||||
| @@ -419,7 +410,6 @@ namespace Grid { | |||||||
|  |  | ||||||
|  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) |  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) | ||||||
|  { |  { | ||||||
| 	 |  | ||||||
|    conformable(Uds._grid,GaugeGrid); |    conformable(Uds._grid,GaugeGrid); | ||||||
|    conformable(Umu._grid,GaugeGrid); |    conformable(Umu._grid,GaugeGrid); | ||||||
|     |     | ||||||
| @@ -429,7 +419,6 @@ namespace Grid { | |||||||
|     |     | ||||||
|    Lattice<iScalar<vInteger> > coor(GaugeGrid); |    Lattice<iScalar<vInteger> > coor(GaugeGrid); | ||||||
| 	 | 	 | ||||||
| 	 |  | ||||||
|    for(int mu=0;mu<Nd;mu++){ |    for(int mu=0;mu<Nd;mu++){ | ||||||
| 	   | 	   | ||||||
|      LatticeCoordinate(coor,mu); |      LatticeCoordinate(coor,mu); | ||||||
| @@ -443,7 +432,6 @@ namespace Grid { | |||||||
|        Uconj = where(coor==neglink,-Uconj,Uconj); |        Uconj = where(coor==neglink,-Uconj,Uconj); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
| 	   |  | ||||||
| PARALLEL_FOR_LOOP | PARALLEL_FOR_LOOP | ||||||
|      for(auto ss=U.begin();ss<U.end();ss++){ |      for(auto ss=U.begin();ss<U.end();ss++){ | ||||||
|        Uds[ss](0)(mu) = U[ss](); |        Uds[ss](0)(mu) = U[ss](); | ||||||
| @@ -477,8 +465,8 @@ namespace Grid { | |||||||
|  } |  } | ||||||
|        |        | ||||||
|        |        | ||||||
|       inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, |  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { | ||||||
| 				FermionField &A, int mu) { |  | ||||||
|    // DhopDir provides U or Uconj depending on coor/flavour. |    // DhopDir provides U or Uconj depending on coor/flavour. | ||||||
|    GaugeLinkField link(mat._grid); |    GaugeLinkField link(mat._grid); | ||||||
|    // use lorentz for flavour as hack. |    // use lorentz for flavour as hack. | ||||||
| @@ -491,8 +479,8 @@ namespace Grid { | |||||||
|    return; |    return; | ||||||
|  } |  } | ||||||
|        |        | ||||||
|       inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, |  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { | ||||||
| 				FermionField Ã, int mu) { |  | ||||||
|    int Ls = Btilde._grid->_fdimensions[0]; |    int Ls = Btilde._grid->_fdimensions[0]; | ||||||
| 	 | 	 | ||||||
|    GaugeLinkField tmp(mat._grid); |    GaugeLinkField tmp(mat._grid); | ||||||
| @@ -508,13 +496,13 @@ namespace Grid { | |||||||
|    PokeIndex<LorentzIndex>(mat, tmp, mu); |    PokeIndex<LorentzIndex>(mat, tmp, mu); | ||||||
|    return; |    return; | ||||||
|  } |  } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec |  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec | ||||||
|  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float |  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float | ||||||
|  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double |  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double | ||||||
|  |  | ||||||
|  |  | ||||||
|  typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec |  typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec | ||||||
|  typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float |  typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float | ||||||
|  typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double |  typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double | ||||||
| @@ -538,6 +526,7 @@ namespace Grid { | |||||||
|  typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec |  typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec | ||||||
|  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float |  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float | ||||||
|  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double |  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double | ||||||
| } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
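These Impl typedefs are the policy classes that fix the SIMD type, gauge representation and coefficient type; a fermion action is then a template instantiated on one of them. A hedged sketch of the pattern (the actual action typedefs and constructor signatures live in the fermion action headers, not in this diff):

  // Illustrative only: the action template is instantiated on an Impl policy.
  // WilsonFermionR matches Grid's own naming; GparityWilsonFermionR is shown
  // here as the analogous pairing and may differ from the actual typedef.
  typedef WilsonFermion<WilsonImplR>        WilsonFermionR;
  typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;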
|   | |||||||
| @@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, | |||||||
|     //////////////////////// |     //////////////////////// | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < B._grid->oSites(); sss++) { |     for (int sss = 0; sss < B._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, |       Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, | ||||||
|                                gamma); |                                gamma); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out, | |||||||
|  |  | ||||||
|   PARALLEL_FOR_LOOP |   PARALLEL_FOR_LOOP | ||||||
|   for (int sss = 0; sss < in._grid->oSites(); sss++) { |   for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|     Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, |     Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, | ||||||
|                              dirdisp, gamma); |                              dirdisp, gamma); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | |||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |     for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, |       Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, | ||||||
|                                    out); |                                    out); | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |     for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, |       Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, | ||||||
|                                 out); |                                 out); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|   if ( DhopCalls > 0 ) { |   if ( DhopCalls > 0 ) { | ||||||
|     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; |     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime |     std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; | ||||||
|               << " us" << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " |     std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl; | ||||||
|               << DhopCommTime / DhopCalls << " us" << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " |  | ||||||
|               << DhopComputeTime << " us" << std::endl; |  | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " |  | ||||||
|               << DhopComputeTime / DhopCalls << " us" << std::endl; |  | ||||||
|  |  | ||||||
|     RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; |     RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl; | ||||||
|  |  | ||||||
|    } |    } | ||||||
|  |  | ||||||
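The /2 added to the mflops estimate above pairs with the call counting moved further down in this file (DhopCalls++ in DhopOE and DhopEO, DhopCalls+=2 in Dhop): a red-black half-checkerboard application touches only half the sites, so each counted call does half the work while a full Dhop counts as two such calls. A hedged restatement of the arithmetic, with volume the site count used by Report() and times in microseconds as in the surrounding code; flopsPerCall is a name introduced here:

  // Illustrative arithmetic only: 1344 flops per site for one full Dhop, and a
  // checkerboarded (OE/EO) call touches half the sites.
  RealD flopsPerCall = 1344.0 * volume / 2.0;
  RealD mflops       = flopsPerCall * DhopCalls / DhopComputeTime;  // time in us -> MFlop/s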
| @@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; | ||||||
|      |      | ||||||
|  |  | ||||||
|  |  | ||||||
|     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; |     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (DerivCalls > 0 || DhopCalls > 0){ |   if (DerivCalls > 0 || DhopCalls > 0){ | ||||||
| @@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP | |||||||
|     for(int s=0;s<Ls;s++){ |     for(int s=0;s<Ls;s++){ | ||||||
|       int sU=ss; |       int sU=ss; | ||||||
|       int sF = s+Ls*sU;  |       int sF = s+Ls*sU;  | ||||||
|       Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma); |       Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st, | |||||||
|         assert(sF < B._grid->oSites()); |         assert(sF < B._grid->oSites()); | ||||||
|         assert(sU < U._grid->oSites()); |         assert(sU < U._grid->oSites()); | ||||||
|  |  | ||||||
|         Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, |         Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); | ||||||
|                                  gamma); |  | ||||||
|  |  | ||||||
|         //////////////////////////// |         //////////////////////////// | ||||||
|         // spin trace outer product |         // spin trace outer product | ||||||
| @@ -396,7 +388,6 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
| 					 DoubledGaugeField & U, | 					 DoubledGaugeField & U, | ||||||
| 					 const FermionField &in, FermionField &out,int dag) | 					 const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|   DhopCalls++; |  | ||||||
|   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); |   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||||
|   Compressor compressor(dag); |   Compressor compressor(dag); | ||||||
|  |  | ||||||
| @@ -413,8 +404,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     for (int ss = 0; ss < U._grid->oSites(); ss++) { |     for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|       int sU = ss; |       int sU = ss; | ||||||
|       int sF = LLs * sU; |       int sF = LLs * sU; | ||||||
|       Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, |       Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); | ||||||
|                                    out); |  | ||||||
|     } |     } | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   } else if (stat.is_init() ) { |   } else if (stat.is_init() ) { | ||||||
| @@ -428,11 +418,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     int mythread = omp_get_thread_num(); |     int mythread = omp_get_thread_num(); | ||||||
|     stat.enter(mythread); |     stat.enter(mythread); | ||||||
| #pragma omp for nowait | #pragma omp for nowait | ||||||
|    for(int ss=0;ss<U._grid->oSites();ss++) |     for(int ss=0;ss<U._grid->oSites();ss++) { | ||||||
|     { |  | ||||||
|       int sU=ss; |       int sU=ss; | ||||||
|       int sF=LLs*sU; |       int sF=LLs*sU; | ||||||
|        Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); |       Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); | ||||||
|     } |     } | ||||||
|     stat.exit(mythread); |     stat.exit(mythread); | ||||||
|     } |     } | ||||||
| @@ -443,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     for (int ss = 0; ss < U._grid->oSites(); ss++) { |     for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|       int sU = ss; |       int sU = ss; | ||||||
|       int sF = LLs * sU; |       int sF = LLs * sU; | ||||||
|       Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, |       Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); | ||||||
|                                 out); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   DhopComputeTime+=usecond(); |   DhopComputeTime+=usecond(); | ||||||
| @@ -454,6 +442,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls++; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -465,6 +454,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls++; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -476,6 +466,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=2; | ||||||
|   conformable(in._grid,FermionGrid()); // verifies full grid |   conformable(in._grid,FermionGrid()); // verifies full grid | ||||||
|   conformable(in._grid,out._grid); |   conformable(in._grid,out._grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,7 +34,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/Stat.h> | #include <Grid/Stat.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -182,7 +181,7 @@ namespace Grid { | |||||||
|     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; |     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; | ||||||
|      |      | ||||||
|   }; |   }; | ||||||
|   } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){}; | |||||||
| //////////////////////////////////////////// | //////////////////////////////////////////// | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag( | void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 						     SiteHalfSpinor *buf, int sF, | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 						     int sU, const FermionField &in, FermionField &out) { | 						     int sU, const FermionField &in, FermionField &out) { | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
| @@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag( | |||||||
|  |  | ||||||
| // Need controls to do interior, exterior, or both | // Need controls to do interior, exterior, or both | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptGenericDhopSite( | void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 						  SiteHalfSpinor *buf, int sF, | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 						  int sU, const FermionField &in, FermionField &out) { | 						  int sU, const FermionField &in, FermionField &out) { | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
| @@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite( | |||||||
| }; | }; | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptDhopDir( | void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, | ||||||
|     StencilImpl &st, DoubledGaugeField &U, |  | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) { | 					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) { | ||||||
|  |  | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
|   SiteSpinor result; |   SiteSpinor result; | ||||||
|   | |||||||
| @@ -32,7 +32,6 @@ directory | |||||||
| #define GRID_QCD_DHOP_H | #define GRID_QCD_DHOP_H | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -56,16 +55,11 @@ namespace Grid { | |||||||
|     |     | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
|   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type |   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type | ||||||
| 	DiracOptDhopSite( |   DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			 int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			 FermionField &out) { |  | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|     if (AsmOpt) { |     if (AsmOpt) { | ||||||
| 	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, |       WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); | ||||||
| 						   in, out); |  | ||||||
|  |  | ||||||
|     } else { |     } else { | ||||||
| #else | #else | ||||||
|     { |     { | ||||||
| @@ -73,11 +67,9 @@ namespace Grid { | |||||||
|       for (int site = 0; site < Ns; site++) { |       for (int site = 0; site < Ns; site++) { | ||||||
| 	for (int s = 0; s < Ls; s++) { | 	for (int s = 0; s < Ls; s++) { | ||||||
| 	  if (HandOpt) | 	  if (HandOpt) | ||||||
| 		  WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); | ||||||
| 							    in, out); |  | ||||||
| 	  else | 	  else | ||||||
| 		  WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); | ||||||
| 							       in, out); |  | ||||||
| 	  sF++; | 	  sF++; | ||||||
| 	} | 	} | ||||||
| 	sU++; | 	sU++; | ||||||
| @@ -87,15 +79,12 @@ namespace Grid { | |||||||
|       |       | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
|   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type |   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type | ||||||
| 	  DiracOptDhopSite( |   DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |       | ||||||
| 			   int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			   FermionField &out) { |  | ||||||
|     for (int site = 0; site < Ns; site++) { |     for (int site = 0; site < Ns; site++) { | ||||||
|       for (int s = 0; s < Ls; s++) { |       for (int s = 0; s < Ls; s++) { | ||||||
| 	      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, | 	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); | ||||||
| 							   out); |  | ||||||
| 	sF++; | 	sF++; | ||||||
|       } |       } | ||||||
|       sU++; |       sU++; | ||||||
| @@ -103,17 +92,12 @@ namespace Grid { | |||||||
|   } |   } | ||||||
|       |       | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
| 	  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool, |   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type | ||||||
| 				  void>::type |   DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 	  DiracOptDhopSiteDag( | 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			      int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			      FermionField &out) { |  | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|     if (AsmOpt) { |     if (AsmOpt) { | ||||||
| 				      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, |       WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); | ||||||
| 										  Ns, in, out); |  | ||||||
|     } else { |     } else { | ||||||
| #else | #else | ||||||
|     { |     { | ||||||
| @@ -121,11 +105,9 @@ namespace Grid { | |||||||
|       for (int site = 0; site < Ns; site++) { |       for (int site = 0; site < Ns; site++) { | ||||||
| 	for (int s = 0; s < Ls; s++) { | 	for (int s = 0; s < Ls; s++) { | ||||||
| 	  if (HandOpt) | 	  if (HandOpt) | ||||||
| 					      WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											   in, out); |  | ||||||
| 	  else | 	  else | ||||||
| 					      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, | 	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											      sU, in, out); |  | ||||||
| 	  sF++; | 	  sF++; | ||||||
| 	} | 	} | ||||||
| 	sU++; | 	sU++; | ||||||
| @@ -134,73 +116,48 @@ namespace Grid { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
| 				      typename std::enable_if< |   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type | ||||||
| 				      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, |   DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, | ||||||
| 				      void>::type | 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 				      DiracOptDhopSiteDag( |  | ||||||
| 							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							  int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 							  FermionField &out) { |  | ||||||
|     for (int site = 0; site < Ns; site++) { |     for (int site = 0; site < Ns; site++) { | ||||||
|       for (int s = 0; s < Ls; s++) { |       for (int s = 0; s < Ls; s++) { | ||||||
| 					    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, | 	WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											    in, out); |  | ||||||
| 	sF++; | 	sF++; | ||||||
|       } |       } | ||||||
|       sU++; |       sU++; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| 				    void DiracOptDhopDir( |   void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, | ||||||
| 							 StencilImpl &st, DoubledGaugeField &U, | 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); | ||||||
| 							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, |  | ||||||
| 							 int gamma); |  | ||||||
|        |        | ||||||
| private: | private: | ||||||
|      // Specialised variants |      // Specialised variants | ||||||
| 				    void DiracOptGenericDhopSite( |   void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			       int sF, int sU, const FermionField &in, FermionField &out); | 			       int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|        |        | ||||||
| 				    void DiracOptGenericDhopSiteDag( |   void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 				  int sF, int sU, const FermionField &in, FermionField &out); | 				  int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| 				    void DiracOptAsmDhopSite( |   void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); | ||||||
| 							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							     int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 							     FermionField &out); |  | ||||||
|  |  | ||||||
| 				    void DiracOptAsmDhopSiteDag( |   void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); | ||||||
| 								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 								int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 								FermionField &out); |  | ||||||
|  |  | ||||||
| 				    void DiracOptHandDhopSite( |   void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			    int sF, int sU, const FermionField &in, FermionField &out); | 			    int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| 				    void DiracOptHandDhopSiteDag( |   void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			       int sF, int sU, const FermionField &in, FermionField &out); | 			       int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|        |        | ||||||
| public: | public: | ||||||
|  |  | ||||||
|   WilsonKernels(const ImplParams &p = ImplParams()); |   WilsonKernels(const ImplParams &p = ImplParams()); | ||||||
|  |  | ||||||
| }; | }; | ||||||
|      |      | ||||||
|       } | }} | ||||||
|     } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
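Across these kernel hunks the half-spinor communication buffer is now passed as a bare SiteHalfSpinor* and obtained via st.CommBuf() instead of reaching into the st.comm_buf vector directly, which keeps the kernel signatures free of the allocator-laden std::vector type. The accessor itself is not shown in this diff; a hedged sketch of what the stencil class is assumed to provide, given the comm_buf vector member visible in one of the hunks above:

  // Hedged sketch of the assumed accessor (not part of this diff): hand the
  // kernels a raw pointer into the aligned communication buffer.
  SiteHalfSpinor *CommBuf(void) { return comm_buf.data(); }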
|   | |||||||
| @@ -38,26 +38,22 @@ namespace Grid { | |||||||
| /////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////// | ||||||
| // Default to no assembler implementation | // Default to no assembler implementation | ||||||
| /////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////// | ||||||
|     template<class Impl> | template<class Impl> void  | ||||||
|       void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|                              std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
|                              int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) |  | ||||||
|     { |  | ||||||
|       assert(0); |  | ||||||
|     } |  | ||||||
|     template<class Impl> |  | ||||||
|       void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, |  | ||||||
|                                 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<class Impl> void  | ||||||
|  | WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|  | 					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   assert(0); | ||||||
|  | } | ||||||
|  |  | ||||||
| #if defined(AVX512)  | #if defined(AVX512)  | ||||||
|      |      | ||||||
|      |  | ||||||
|     /////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////// | ||||||
|     // If we are AVX512 specialise the single precision routine |     // If we are AVX512 specialise the single precision routine | ||||||
|     /////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////// | ||||||
| @@ -84,16 +80,14 @@ namespace Grid { | |||||||
| #define FX(A) WILSONASM_ ##A | #define FX(A) WILSONASM_ ##A | ||||||
|    |    | ||||||
| #undef KERNEL_DAG | #undef KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, | ||||||
| 							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
|        |        | ||||||
| #define KERNEL_DAG | #define KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| @@ -109,31 +103,26 @@ namespace Grid { | |||||||
| #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) | #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) | ||||||
| 				     | 				     | ||||||
| #undef KERNEL_DAG | #undef KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, | ||||||
| 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| #define KERNEL_DAG | #define KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| #define INSTANTIATE_ASM(A)\ | #define INSTANTIATE_ASM(A)\ | ||||||
| template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ | ||||||
|                                    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
|                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ |                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ | ||||||
| template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ |  \ | ||||||
|                                    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ | template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ | ||||||
|                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ |                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ | ||||||
|  |  | ||||||
|  |  | ||||||
| INSTANTIATE_ASM(WilsonImplF); | INSTANTIATE_ASM(WilsonImplF); | ||||||
| INSTANTIATE_ASM(WilsonImplD); | INSTANTIATE_ASM(WilsonImplD); | ||||||
| INSTANTIATE_ASM(ZWilsonImplF); | INSTANTIATE_ASM(ZWilsonImplF); | ||||||
| @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); | |||||||
| INSTANTIATE_ASM(DomainWallVec5dImplD); | INSTANTIATE_ASM(DomainWallVec5dImplD); | ||||||
| INSTANTIATE_ASM(ZDomainWallVec5dImplF); | INSTANTIATE_ASM(ZDomainWallVec5dImplF); | ||||||
| INSTANTIATE_ASM(ZDomainWallVec5dImplD); | INSTANTIATE_ASM(ZDomainWallVec5dImplD); | ||||||
|   } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
|   | |||||||
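Taken together, the hunks above replace the aligned std::vector reference in every kernel signature with a raw SiteHalfSpinor pointer. The standalone sketch below (made-up Site/kernel names, not Grid's actual classes) illustrates the shape of that change: the caller keeps ownership of the buffer and the kernel no longer commits to a particular container type.

// Sketch only: illustrative names, not Grid's API.
#include <cstdio>
#include <vector>

struct Site { double v[4]; };

// Old-style interface: the container type is baked into the kernel signature.
void kernel_old(std::vector<Site> &buf, int ss) { std::printf("%g\n", buf[ss].v[0]); }

// New-style interface: only the element type matters; ownership stays with the caller.
void kernel_new(Site *buf, int ss) { std::printf("%g\n", buf[ss].v[0]); }

int main() {
  std::vector<Site> comm(8, Site{{1, 2, 3, 4}});
  kernel_old(comm, 0);
  kernel_new(comm.data(), 0);   // same storage, looser coupling
  return 0;
}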
| @@ -311,9 +311,8 @@ namespace Grid { | |||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  |  | ||||||
|   template<class Impl> | template<class Impl> void  | ||||||
|   void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, | ||||||
| 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 					  int ss,int sU,const FermionField &in, FermionField &out) | 					  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   typedef typename Simd::scalar_type S; |   typedef typename Simd::scalar_type S; | ||||||
| @@ -555,8 +554,7 @@ namespace QCD { | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
|   void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						  int ss,int sU,const FermionField &in, FermionField &out) | 						  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   //  std::cout << "Hand op Dhop "<<std::endl; |   //  std::cout << "Hand op Dhop "<<std::endl; | ||||||
| @@ -798,37 +796,34 @@ namespace QCD { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
|   // Specialise Gparity to simple implementation |   // Specialise Gparity to simple implementation | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | ||||||
| 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, | 							SiteHalfSpinor *buf, | ||||||
| 							int sF,int sU,const FermionField &in, FermionField &out) | 							int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | ||||||
| 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, | 							   SiteHalfSpinor *buf, | ||||||
| 							   int sF,int sU,const FermionField &in, FermionField &out) | 							   int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							int sF,int sU,const FermionField &in, FermionField &out) | 							int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							   int sF,int sU,const FermionField &in, FermionField &out) | 							   int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| @@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st, | |||||||
| // Need Nc=3 though // | // Need Nc=3 though // | ||||||
|  |  | ||||||
| #define INSTANTIATE_THEM(A) \ | #define INSTANTIATE_THEM(A) \ | ||||||
| template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
| 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
| 						     int ss,int sU,const FermionField &in, FermionField &out); \ | 						     int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
| template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
| 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
| 							int ss,int sU,const FermionField &in, FermionField &out); | 							int ss,int sU,const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| INSTANTIATE_THEM(WilsonImplF); | INSTANTIATE_THEM(WilsonImplF); | ||||||
|   | |||||||
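The INSTANTIATE_ASM / INSTANTIATE_THEM macros above rely on explicit template instantiation so that the kernel bodies can live in a .cc file while still being emitted for every implementation policy. A minimal sketch of that pattern, with hypothetical class and policy names:

// Minimal sketch of the explicit-instantiation macro pattern; names are hypothetical.
#include <cstdio>

template <class Impl> struct Kernels {
  static void DhopSite(int ss)    { std::printf("site %d\n", ss); }
  static void DhopSiteDag(int ss) { std::printf("site-dag %d\n", ss); }
};

struct ImplF {};   // stand-ins for the real implementation policies
struct ImplD {};

// One macro stamps out both member functions for a given policy.
#define INSTANTIATE_SKETCH(A)                \
  template void Kernels<A>::DhopSite(int);   \
  template void Kernels<A>::DhopSiteDag(int);

INSTANTIATE_SKETCH(ImplF);
INSTANTIATE_SKETCH(ImplD);

int main() { Kernels<ImplF>::DhopSite(0); return 0; }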
| @@ -42,6 +42,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| namespace Grid{ | namespace Grid{ | ||||||
| namespace Optimization { | namespace Optimization { | ||||||
|  |  | ||||||
|  |   union u512f { | ||||||
|  |     __m512 v; | ||||||
|  |     float f[16]; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   union u512d { | ||||||
|  |     __m512d v; | ||||||
|  |     double f[8]; | ||||||
|  |   }; | ||||||
|  |    | ||||||
|   struct Vsplat{ |   struct Vsplat{ | ||||||
|     //Complex float |     //Complex float | ||||||
|     inline __m512 operator()(float a, float b){ |     inline __m512 operator()(float a, float b){ | ||||||
| @@ -361,8 +371,14 @@ namespace Optimization { | |||||||
|   // Some Template specialization |   // Some Template specialization | ||||||
|  |  | ||||||
|   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases |   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases | ||||||
| #undef GNU_CLANG_COMPILER  | #ifndef __INTEL_COMPILER
| #ifdef GNU_CLANG_COMPILER | #warning "Slow reduction due to incomplete reduce intrinsics"
|   //Complex float Reduce |   //Complex float Reduce | ||||||
|   template<> |   template<> | ||||||
|     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ |     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ | ||||||
|   | |||||||
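The u512f/u512d unions added above exist so that, on compilers without the _mm512_reduce_add_* intrinsics, a reduction can fall back to summing scalar lanes (hence the "Slow reduction" warning for non-Intel compilers). A sketch of that fallback, assuming an AVX-512 capable compiler (-mavx512f); only the union definition mirrors the code above, the rest is illustrative:

// Union-based lane spill for the slow-reduction fallback.
#include <immintrin.h>
#include <cstdio>

union u512f {
  __m512 v;
  float  f[16];
};

static inline float reduce_add_fallback(__m512 in) {
  u512f tmp;
  tmp.v = in;
  float sum = 0.f;
  for (int i = 0; i < 16; i++) sum += tmp.f[i];   // scalar sum over all lanes
  return sum;
}

int main() {
  __m512 x = _mm512_set1_ps(1.0f);
  std::printf("%f\n", reduce_add_fallback(x));    // expect 16.0
  return 0;
}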
| @@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int | |||||||
|   } |   } | ||||||
|  } |  } | ||||||
|  |  | ||||||
| template<class vobj> inline  | template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) | ||||||
| void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) |  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type ; |   typedef typename vobj::scalar_type scalar_type ; | ||||||
|   typedef typename vobj::vector_type vector_type ; |   typedef typename vobj::vector_type vector_type ; | ||||||
| @@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int | |||||||
|   }} |   }} | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline  | template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) | ||||||
| void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) |  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type ; |   typedef typename vobj::scalar_type scalar_type ; | ||||||
|   typedef typename vobj::vector_type vector_type ; |   typedef typename vobj::vector_type vector_type ; | ||||||
|   | |||||||
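The merge1/merge2 hunks above only tighten the declarations, but the operation they perform (interleaving one scalar object per SIMD lane back into a vectorized object) can be pictured with this simplified, hypothetical layout (fixed lane count, plain float fields; not Grid's actual types):

// Toy illustration of a merge-style helper; all names and types are hypothetical.
#include <cstdio>

constexpr int Nsimd = 4;

struct ScalarSite { float re, im; };                  // one lane's worth of data
struct VectorSite { float re[Nsimd], im[Nsimd]; };    // struct-of-arrays layout

void merge_sketch(VectorSite &vec, ScalarSite *extracted[Nsimd]) {
  for (int lane = 0; lane < Nsimd; lane++) {
    vec.re[lane] = extracted[lane]->re;               // lane-by-lane interleave
    vec.im[lane] = extracted[lane]->im;
  }
}

int main() {
  ScalarSite s[Nsimd] = {{0,0},{1,1},{2,2},{3,3}};
  ScalarSite *ptrs[Nsimd] = {&s[0], &s[1], &s[2], &s[3]};
  VectorSite v;
  merge_sketch(v, ptrs);
  std::printf("%g %g\n", v.re[2], v.im[3]);           // 2 3
  return 0;
}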
| @@ -116,7 +116,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    Check._odata[i] = Foo._odata[SE->_offset]; | 	    Check._odata[i] = Foo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    Check._odata[i] = myStencil.comm_buf[SE->_offset]; | 	    Check._odata[i] = myStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	Real nrmC = norm2(Check); | 	Real nrmC = norm2(Check); | ||||||
| @@ -207,7 +207,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    OCheck._odata[i] = EFoo._odata[SE->_offset]; | 	    OCheck._odata[i] = EFoo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; | 	    OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
| 	for(int i=0;i<ECheck._grid->oSites();i++){ | 	for(int i=0;i<ECheck._grid->oSites();i++){ | ||||||
| 	  int permute_type; | 	  int permute_type; | ||||||
| @@ -220,7 +220,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    ECheck._odata[i] = OFoo._odata[SE->_offset]; | 	    ECheck._odata[i] = OFoo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; | 	    ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| 	setCheckerboard(Check,ECheck); | 	setCheckerboard(Check,ECheck); | ||||||
|   | |||||||
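The test updates above track an interface change in the stencil: the communication buffer is no longer reached through a public comm_buf member but through a CommBuf() accessor. A toy version of that encapsulation (class and member names are illustrative, not Grid's actual stencil):

// Sketch of hiding the communication buffer behind an accessor.
#include <vector>
#include <cstdio>

struct HalfSpinor { double v; };

class StencilSketch {
public:
  HalfSpinor *CommBuf() { return u_comm_buf.data(); }  // new-style access
  void resize(std::size_t n) { u_comm_buf.resize(n); }
private:
  std::vector<HalfSpinor> u_comm_buf;                  // was: a public comm_buf member
};

int main() {
  StencilSketch st;
  st.resize(4);
  st.CommBuf()[0].v = 1.5;               // mirrors Check._odata[i] = st.CommBuf()[offset]
  std::printf("%f\n", st.CommBuf()[0].v);
  return 0;
}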