From e06a11ee5e7be701f37db24505844c85fe5ee61c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 6 Apr 2015 06:30:48 +0100 Subject: [PATCH] Bringing in LatticeInteger with the idea of implemented predicated assignment, subsets etc. c.f the QDP++ "where" syntax --- Grid_Cartesian.h | 283 ++++++++++++++++++++++++++------------- Grid_Communicator.h | 54 +++++++- Grid_Lattice.h | 93 +++++++------ Grid_QCD.h | 14 +- Grid_cshift_common.h | 12 +- Grid_cshift_fake.h | 8 +- Grid_cshift_mpi.h | 14 +- Grid_main.cc | 51 ++++--- Grid_math_types.h | 22 +--- Grid_mpi.cc | 29 +++- Grid_simd.h | 119 +++++++++++++---- Grid_vInteger.h | 307 +++++++++++++++++++++++++++++++++++++++++++ TODO | 3 +- 13 files changed, 788 insertions(+), 221 deletions(-) create mode 100644 Grid_vInteger.h diff --git a/Grid_Cartesian.h b/Grid_Cartesian.h index a983639c..004c2fa5 100644 --- a/Grid_Cartesian.h +++ b/Grid_Cartesian.h @@ -6,126 +6,236 @@ namespace Grid{ ///////////////////////////////////////////////////////////////////////////////////////// -// Grid Support. Following will go into Grid.h. +// Grid Support. ///////////////////////////////////////////////////////////////////////////////////////// - // Cartesian grids - // Grid::Grid - // Grid::GridCartesian - // Grid::GridCartesianRedBlack +// +// Cartesian grid inheritance +// Grid::GridBase +// | +// __________|___________ +// | | +// Grid::GridCartesian Grid::GridCartesianRedBlack +// +// TODO: document the following as an API guaranteed public interface + /* + * Rough map of functionality against QDP++ Layout + * + * Param | Grid | QDP++ + * ----------------------------------------- + * | | + * void | oSites, iSites, lSites | sitesOnNode + * void | gSites | vol + * | | + * gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP + * lcoor | | + * + * void | CheckerBoarded | - // No checkerboarded in QDP + * void | FullDimensions | lattSize + * void | GlobalDimensions | lattSize // No checkerboarded in QDP + * void | LocalDimensions | subgridLattSize + * void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP + * | | + * int x 3 | oiSiteRankToGlobal | siteCoords + * | ProcessorCoorLocalCoorToGlobalCoor | + * | | + * vector | GlobalCoorToRankIndex | nodeNumber(coord) + * vector | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord) + * | | + * void | Processors | logicalSize // returns cart array shape + * void | ThisRank | nodeNumber(); // returns this node rank + * void | ThisProcessorCoor | // returns this node coor + * void | isBoss(void) | primaryNode(); + * | | + * | RankFromProcessorCoor | getLogicalCoorFrom(node) + * | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord) + */ -class SimdGrid : public CartesianCommunicator { +class GridBase : public CartesianCommunicator { public: + // Give Lattice access + template friend class Lattice; - SimdGrid(std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; + GridBase(std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; - // Give Lattice access - template friend class Lattice; -//protected: - - // Lattice wide random support. not yet fully implemented. Need seed strategy - // and one generator per site. - //std::default_random_engine generator; - // static std::mt19937 generator( 9 ); - - // Grid information. - - // Commicator provides + //protected: + // Lattice wide random support. not yet fully implemented. Need seed strategy + // and one generator per site. + // std::default_random_engine generator; + // static std::mt19937 generator( 9 ); + + ////////////////////////////////////////////////////////////////////// + // Commicator provides information on the processor grid + ////////////////////////////////////////////////////////////////////// // unsigned long _ndimension; // std::vector _processors; // processor grid // int _processor; // linear processor rank // std::vector _processor_coor; // linear processor rank + ////////////////////////////////////////////////////////////////////// + // Physics Grid information. std::vector _simd_layout; // Which dimensions get relayed out over simd lanes. - - std::vector _fdimensions;// Global dimensions of array prior to cb removal std::vector _gdimensions;// Global dimensions of array after cb removal std::vector _ldimensions;// local dimensions of array with processor images removed std::vector _rdimensions;// Reduced local dimensions with simd lane images and processor images removed + std::vector _ostride; // Outer stride for each dimension + std::vector _istride; // Inner stride i.e. within simd lane + int _osites; // _isites*_osites = product(dimensions). + int _isites; + std::vector _slice_block; // subslice information + std::vector _slice_stride; + std::vector _slice_nblock; + // Might need these at some point // std::vector _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d] // std::vector _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 - - std::vector _ostride; // Outer stride for each dimension - std::vector _istride; // Inner stride i.e. within simd lane - - int _osites; // _isites*_osites = product(dimensions). - int _isites; - - // subslice information - std::vector _slice_block; - std::vector _slice_stride; - std::vector _slice_nblock; public: - - // These routines are key. Subdivide the linearised cartesian index into - // "inner" index identifying which simd lane of object is associated with coord - // "outer" index identifying which element of _odata in class "Lattice" is associated with coord. - // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer - // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional - // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all - // lanes are operated upon simultaneously. - - inline int oIndexReduced(std::vector &rcoor) - { - int idx=0; - for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d]; - return idx; - } - virtual int oIndex(std::vector &coor) - { - int idx=0; - for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); - return idx; - } - inline int iIndex(std::vector &rcoor) - { - int idx=0; - for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]); - return idx; - } - inline int iCoordFromIsite(int lane,int mu) - { - std::vector coor(_ndimension); - for(int d=0;d<_ndimension;d++){ - coor[d] = lane % _simd_layout[d]; - lane = lane / _simd_layout[d]; - } - return coor[mu]; - } - - inline int oSites(void) { return _osites; }; - inline int iSites(void) { return _isites; }; - inline int CheckerBoardFromOsite (int Osite){ + //////////////////////////////////////////////////////////////// + // Checkerboarding interface is virtual and overridden by + // GridCartesian / GridRedBlackCartesian + //////////////////////////////////////////////////////////////// + virtual int CheckerBoarded(int dim)=0; + virtual int CheckerBoard(std::vector site)=0; + virtual int CheckerBoardDestination(int source_cb,int shift)=0; + virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; + inline int CheckerBoardFromOindex (int Oindex){ std::vector ocoor; - CoordFromOsite(ocoor,Osite); + oCoorFromOindex(ocoor,Oindex); int ss=0; for(int d=0;d<_ndimension;d++){ ss=ss+ocoor[d]; } return ss&0x1; } - inline void CoordFromOsite (std::vector& coor,int Osite){ + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Local layout calculations + ////////////////////////////////////////////////////////////////////////////////////////////// + // These routines are key. Subdivide the linearised cartesian index into + // "inner" index identifying which simd lane of object is associated with coord + // "outer" index identifying which element of _odata in class "Lattice" is associated with coord. + // + // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer + // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional + // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all + // lanes are operated upon simultaneously. + + virtual int oIndex(std::vector &coor) + { + int idx=0; + // Works with either global or local coordinates + for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); + return idx; + } + inline int oIndexReduced(std::vector &ocoor) + { + int idx=0; + // ocoor is already reduced so can eliminate the modulo operation + // for fast indexing and inline the routine + for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; + return idx; + } + inline void oCoorFromOindex (std::vector& coor,int Oindex){ coor.resize(_ndimension); for(int d=0;d<_ndimension;d++){ - coor[d] = Osite % _rdimensions[d]; - Osite = Osite / _rdimensions[d]; + coor[d] = Oindex % _rdimensions[d]; + Oindex = Oindex / _rdimensions[d]; } } - virtual int CheckerBoarded(int dim)=0; - virtual int CheckerBoard(std::vector site)=0; - virtual int CheckerBoardDestination(int source_cb,int shift)=0; - virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; + ////////////////////////////////////////////////////////// + // SIMD lane addressing + ////////////////////////////////////////////////////////// + inline int iIndex(std::vector &lcoor) + { + int idx=0; + for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); + return idx; + } + inline void iCoorFromIindex(std::vector &coor,int lane) + { + coor.resize(_ndimension); + for(int d=0;d<_ndimension;d++){ + coor[d] = lane % _simd_layout[d]; + lane = lane / _simd_layout[d]; + } + } + + //////////////////////////////////////////////////////////////// + // Array sizing queries + //////////////////////////////////////////////////////////////// + + inline int iSites(void) { return _isites; }; + inline int Nsimd(void) { return _isites; };// Synonymous with iSites + inline int oSites(void) { return _osites; }; + inline int lSites(void) { return _isites*_osites; }; + inline int gSites(void) { return _isites*_osites*_Nprocessors; }; + inline int Nd (void) { return _ndimension;}; + inline const std::vector &FullDimensions(void) { return _fdimensions;}; + inline const std::vector &GlobalDimensions(void) { return _gdimensions;}; + inline const std::vector &LocalDimensions(void) { return _ldimensions;}; + inline const std::vector &VirtualLocalDimensions(void) { return _ldimensions;}; + + //////////////////////////////////////////////////////////////// + // Global addressing + //////////////////////////////////////////////////////////////// + void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector &gcoor) + { + gcoor.resize(_ndimension); + std::vector coor(_ndimension); + + ProcessorCoorFromRank(rank,coor); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]&coor[mu]; + + iCoorFromIindex(coor,i_idx); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]&coor[mu]; + + oCoorFromOindex (coor,o_idx); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; + + } + void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector &fcoor) + { + RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); + if(CheckerBoarded(0)){ + fcoor[0] = fcoor[0]*2+cb; + } + } + void ProcessorCoorLocalCoorToGlobalCoor(std::vector &Pcoor,std::vector &Lcoor,std::vector &gcoor) + { + gcoor.resize(_ndimension); + for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; + } + void GlobalCoorToProcessorCoorLocalCoor(std::vector &pcoor,std::vector &lcoor,const std::vector &gcoor) + { + pcoor.resize(_ndimension); + lcoor.resize(_ndimension); + for(int mu=0;mu<_ndimension;mu++){ + pcoor[mu] = gcoor[mu]/_ldimensions[mu]; + lcoor[mu] = gcoor[mu]%_ldimensions[mu]; + } + } + void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector &gcoor) + { + std::vector pcoor; + std::vector lcoor; + GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); + rank = RankFromProcessorCoor(pcoor); + i_idx= iIndex(lcoor); + o_idx= oIndex(lcoor); + } + }; -class GridCartesian: public SimdGrid { +class GridCartesian: public GridBase { + public: + virtual int CheckerBoarded(int dim){ return 0; } @@ -141,7 +251,7 @@ public: GridCartesian(std::vector &dimensions, std::vector &simd_layout, std::vector &processor_grid - ) : SimdGrid(processor_grid) + ) : GridBase(processor_grid) { /////////////////////// // Grid information @@ -200,14 +310,12 @@ public: _slice_nblock[d]=nblock; block = block*_rdimensions[d]; } - - assert( _isites == vComplex::Nsimd()); }; }; // Specialise this for red black grids storing half the data like a chess board. -class GridRedBlackCartesian : public SimdGrid +class GridRedBlackCartesian : public GridBase { public: virtual int CheckerBoarded(int dim){ @@ -230,7 +338,7 @@ public: // Probably faster with table lookup; // or by looping over x,y,z and multiply rather than computing checkerboard. - int ocb=CheckerBoardFromOsite(osite); + int ocb=CheckerBoardFromOindex(osite); if ( (source_cb+ocb)&1 ) { return (shift)/2; @@ -248,7 +356,7 @@ public: }; GridRedBlackCartesian(std::vector &dimensions, std::vector &simd_layout, - std::vector &processor_grid) : SimdGrid(processor_grid) + std::vector &processor_grid) : GridBase(processor_grid) { /////////////////////// // Grid information @@ -310,7 +418,6 @@ public: block = block*_rdimensions[d]; } - assert ( _isites == vComplex::Nsimd()); }; protected: virtual int oIndex(std::vector &coor) diff --git a/Grid_Communicator.h b/Grid_Communicator.h index 4f83da34..c7e71f12 100644 --- a/Grid_Communicator.h +++ b/Grid_Communicator.h @@ -22,23 +22,63 @@ class CartesianCommunicator { MPI_Comm communicator; #endif + // Constructor CartesianCommunicator(std::vector &pdimensions_in); + // Wraps MPI_Cart routines void ShiftedRanks(int dim,int shift,int & source, int & dest); - int Rank(std::vector coor); - int MyRank(void); - void GlobalSumF(float &); - void GlobalSumFVector(float *,int N); + int RankFromProcessorCoor(std::vector &coor); + void ProcessorCoorFromRank(int rank,std::vector &coor); - void GlobalSumF(double &); - void GlobalSumFVector(double *,int N); + ///////////////////////////////// + // Grid information queries + ///////////////////////////////// + int IsBoss(void) { return _processor==0; }; + int ThisRank(void) { return _processor; }; + const std::vector & ThisProcessorCoor(void) { return _processor_coor; }; + const std::vector & ProcessorGrid(void) { return _processors; }; + int ProcessorCount(void) { return _Nprocessors; }; + //////////////////////////////////////////////////////////// + // Reduction + //////////////////////////////////////////////////////////// + void GlobalSum(float &); + void GlobalSumVector(float *,int N); + + void GlobalSum(double &); + void GlobalSumVector(double *,int N); + + template void GlobalSumObj(obj &o){ + + typedef typename obj::scalar_type scalar_type; + int words = sizeof(obj)/sizeof(scalar_type); + + scalar_type * ptr = (scalar_type *)& o; + GlobalSum(ptr,words); + } + //////////////////////////////////////////////////////////// + // Face exchange + //////////////////////////////////////////////////////////// void SendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, int recv_from_rank, int bytes); - + + //////////////////////////////////////////////////////////// + // Barrier + //////////////////////////////////////////////////////////// + void Barrier(void); + + //////////////////////////////////////////////////////////// + // Broadcast a buffer and composite larger + //////////////////////////////////////////////////////////// + void Broadcast(int root,void* data, int bytes); + template void Broadcast(int root,obj &data) + { + Broadcast(root,(void *)&data,sizeof(data)); + }; + }; } diff --git a/Grid_Lattice.h b/Grid_Lattice.h index aab4fec9..2aa46b97 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -20,7 +20,7 @@ template class Lattice { public: - SimdGrid *_grid; + GridBase *_grid; int checkerboard; std::vector > _odata; typedef typename vobj::scalar_type scalar_type; @@ -28,7 +28,7 @@ public: public: - Lattice(SimdGrid *grid) : _grid(grid) { + Lattice(GridBase *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; @@ -95,56 +95,67 @@ public: template friend void pokeSite(const sobj &s,Lattice &l,std::vector &site){ - typedef typename vobj::scalar_type stype; - typedef typename vobj::vector_type vtype; + GridBase *grid=l._grid; - assert( l.checkerboard == l._grid->CheckerBoard(site)); + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; - int o_index = l._grid->oIndex(site); - int i_index = l._grid->iIndex(site); + int Nsimd = grid->Nsimd(); + + assert( l.checkerboard== l._grid->CheckerBoard(site)); + assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + + int rank,odx,idx; + grid->GlobalCoorToRankIndex(rank,odx,idx,site); + + // Optional to broadcast from node 0. + grid->Broadcast(0,s); + + std::vector buf(Nsimd); + std::vector pointers(Nsimd); + for(int i=0;i - friend void peekSite(sobj &s,const Lattice &l,std::vector &site){ + friend void peekSite(sobj &s,Lattice &l,std::vector &site){ - typedef typename vobj::scalar_type stype; - typedef typename vobj::vector_type vtype; + GridBase *grid=l._grid; + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nsimd = grid->Nsimd(); assert( l.checkerboard== l._grid->CheckerBoard(site)); + assert( sizeof(sobj)*Nsimd == sizeof(vobj)); - int o_index = l._grid->oIndex(site); - int i_index = l._grid->iIndex(site); + int rank,odx,idx; + grid->GlobalCoorToRankIndex(rank,odx,idx,site); + std::vector buf(Nsimd); + std::vector pointers(Nsimd); + for(int i=0;iBroadcast(rank,s); + return; }; - // Randomise + // FIXME Randomise; deprecate this friend void random(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; size_t v_len = l._grid->oSites()*sizeof(vobj); size_t d_len = v_len/sizeof(Real); @@ -154,9 +165,9 @@ public: v_ptr[i]=drand48(); } }; - + + // FIXME for debug; deprecate this friend void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; size_t o_len = l._grid->oSites(); size_t v_len = sizeof(vobj)/sizeof(vRealF); @@ -183,7 +194,6 @@ public: for(int i=0;i conj(const Lattice &lhs){ Lattice ret(lhs._grid); #pragma omp parallel for @@ -259,7 +270,7 @@ public: std::vector coor; int cbos; - full._grid->CoordFromOsite(coor,ss); + full._grid->oCoorFromOindex(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { @@ -277,7 +288,7 @@ public: std::vector coor; int cbos; - full._grid->CoordFromOsite(coor,ss); + full._grid->oCoorFromOindex(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { @@ -351,7 +362,6 @@ public: template inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { - std::cerr <<"Oscalar * Lattice calling mult"< ret(rhs._grid); #pragma omp parallel for @@ -383,7 +393,6 @@ public: template inline auto operator * (const Lattice &lhs,const right &rhs) -> Lattice { - std::cerr <<"Lattice * Oscalar calling mult"< ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ diff --git a/Grid_QCD.h b/Grid_QCD.h index f6cc6087..a0f7801a 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -21,7 +21,13 @@ namespace QCD { typedef iSinglet TComplex; // This is painful. Tensor singlet complex type. typedef iSinglet vTComplex; - typedef iSinglet TReal; // This is painful. Tensor singlet complex type. + typedef iSinglet TReal; // This is painful. Tensor singlet complex type. + + + typedef iSinglet vTIntegerF; + typedef iSinglet vTIntegerD; + typedef iSinglet vTIntegerC; + typedef iSinglet vTIntegerZ; typedef iSpinMatrix SpinMatrix; typedef iColourMatrix ColourMatrix; @@ -39,9 +45,13 @@ namespace QCD { typedef iSpinVector vSpinVector; typedef iColourVector vColourVector; typedef iSpinColourVector vSpinColourVector; - typedef Lattice LatticeComplex; + + typedef Lattice LatticeIntegerF; // Predicates for "where" + typedef Lattice LatticeIntegerD; + typedef Lattice LatticeIntegerC; + typedef Lattice LatticeIntegerZ; typedef Lattice LatticeColourMatrix; typedef Lattice LatticeSpinMatrix; diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 203bf2f5..72518d7d 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -45,7 +45,7 @@ friend void Gather_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b);// Could easily be a table lookup + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb &cbmask ) { buffer[bo]=rhs._odata[so+o+b]; bo++; @@ -90,7 +90,7 @@ friend void Gather_plane_extract(Lattice &rhs,std::vector p for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b); + int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb & cbmask ) { extract(rhs._odata[so+o+b],pointers); } @@ -135,7 +135,7 @@ friend void Scatter_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b);// Could easily be a table lookup + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { rhs._odata[so+o+b]=buffer[bo++]; } @@ -179,7 +179,7 @@ friend void Scatter_plane_merge(Lattice &rhs,std::vector po for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b); + int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb&cbmask ) { merge(rhs._odata[so+o+b],pointers); } @@ -224,7 +224,7 @@ friend void Copy_plane(Lattice& lhs,Lattice &rhs, int dimension,int for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b); + int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb&cbmask ) { lhs._odata[lo+o+b]=rhs._odata[ro+o+b]; @@ -267,7 +267,7 @@ friend void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimens for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - int ocb=1<CheckerBoardFromOsite(o+b); + int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb&cbmask ) { permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type); diff --git a/Grid_cshift_fake.h b/Grid_cshift_fake.h index 0f306b1b..8b6c293b 100644 --- a/Grid_cshift_fake.h +++ b/Grid_cshift_fake.h @@ -7,9 +7,11 @@ friend Lattice Cshift(Lattice &rhs,int dimension,int shift) { typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; - const int Nsimd = vector_type::Nsimd(); Lattice ret(rhs._grid); + + GridBase *grid=rhs._grid; + const int Nsimd = grid->Nsimd(); int fd = rhs._grid->_fdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension]; @@ -43,7 +45,7 @@ friend Lattice Cshift(Lattice &rhs,int dimension,int shift) for(int x=0;x Cshift(Lattice &rhs,int dimension,int shift) o +=rhs._grid->_slice_stride[dimension]; } - for(int i=0;i &ret,Lattice &rhs,int dimension,int typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; - SimdGrid *grid=rhs._grid; + GridBase *grid=rhs._grid; Lattice temp(rhs._grid); int fd = rhs._grid->_fdimensions[dimension]; @@ -130,8 +130,8 @@ friend void Cshift_comms(Lattice &ret,Lattice &rhs,int dimension,int friend void Cshift_comms_simd(Lattice &ret,Lattice &rhs,int dimension,int shift,int cbmask) { - const int Nsimd = vector_type::Nsimd(); - SimdGrid *grid=rhs._grid; + GridBase *grid=rhs._grid; + const int Nsimd = grid->Nsimd(); typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; @@ -173,6 +173,7 @@ friend void Cshift_comms_simd(Lattice &ret,Lattice &rhs,int dimensi std::vector comm_offnode(simd_layout); std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(grid->Nd()); for(int x=0;x &ret,Lattice &rhs,int dimensi for(int i=0;iiCoordFromIsite(i,dimension); + + int s; + grid->iCoorFromIindex(icoor,i); + s = icoor[dimension]; if(comm_offnode[s]){ @@ -232,7 +236,7 @@ friend void Cshift_comms_simd(Lattice &ret,Lattice &rhs,int dimensi if ( x< rd-num ) permute_slice=wrap; else permute_slice = 1-wrap; - for(int i=0;i simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=4; - mpi_layout[1]=2; - mpi_layout[2]=4; + mpi_layout[0]=2; + mpi_layout[1]=1; + mpi_layout[2]=1; mpi_layout[3]=2; #ifdef AVX512 @@ -34,7 +34,7 @@ int main (int argc, char ** argv) omp_set_num_threads(omp); #endif - for(int lat=16;lat<=16;lat+=40){ + for(int lat=8;lat<=16;lat+=40){ latt_size[0] = lat; latt_size[1] = lat; latt_size[2] = lat; @@ -166,6 +166,7 @@ int main (int argc, char ** argv) // Lattice SU(3) x SU(3) + Fine.Barrier(); FooBar = Foo * Bar; // Lattice 12x12 GEMM @@ -179,37 +180,47 @@ int main (int argc, char ** argv) flops = ncall*1.0*volume*(8*Nc*Nc*Nc); bytes = ncall*1.0*volume*Nc*Nc *2*3*sizeof(Grid::Real); - printf("%f flop and %f bytes\n",flops,bytes/ncall); + if ( Fine.IsBoss() ) { + printf("%f flop and %f bytes\n",flops,bytes/ncall); + } FooBar = Foo * Bar; + Fine.Barrier(); t0=usecond(); for(int i=0;iblack - std::cout << "Shifting odd parities to even result"<red ShiftedCheck=zero; @@ -332,8 +343,10 @@ int main (int argc, char ** argv) nrm = nrm + real(conj(diff)*diff); }} }}}} + if( Fine.IsBoss() ){ + std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<(y,b,perm); + } + friend inline void merge(vIntegerF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(vIntegerF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + }; + + + class vIntegerD : public vInteger + { + public: + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);} + + friend inline void permute(vIntegerD &y,vIntegerD b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vIntegerD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(vIntegerD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + }; + + + class vIntegerC : public vInteger + { + public: + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);} + + friend inline void permute(vIntegerC &y,vIntegerC b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vIntegerC &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(vIntegerC &y,std::vector &extracted) + { + Gextract(y,extracted); + } + }; + + class vIntegerZ : public vInteger + { + public: + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);} + + friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vIntegerZ &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(vIntegerZ &y,std::vector &extracted) + { + Gextract(y,extracted); + } + }; + +} + +#endif diff --git a/TODO b/TODO index 8108fbc6..9febd6c3 100644 --- a/TODO +++ b/TODO @@ -1,8 +1,9 @@ -* Make peekSite, pokeSite work in an MPI context. +* FIXME audit * Conditional execution Subset, where etc... * Coordinate information, integers etc... * Integer type padding/union to vector. +* LatticeCoordinate[mu] * Optimise the extract/merge SIMD routines