mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
parallel_for elimination -> thread_loop
This commit is contained in:
parent
9472b02771
commit
70e276e1ab
@ -128,9 +128,9 @@ public:
|
|||||||
for(int i=0;i<nbasis;i++){
|
for(int i=0;i<nbasis;i++){
|
||||||
blockProject(iProj,subspace[i],subspace);
|
blockProject(iProj,subspace[i],subspace);
|
||||||
eProj=Zero();
|
eProj=Zero();
|
||||||
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
|
thread_loop( (int ss=0;ss<CoarseGrid->oSites();ss++),{
|
||||||
eProj[ss](i)=CComplex(1.0);
|
eProj[ss](i)=CComplex(1.0);
|
||||||
}
|
});
|
||||||
eProj=eProj - iProj;
|
eProj=eProj - iProj;
|
||||||
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
||||||
}
|
}
|
||||||
@ -272,7 +272,7 @@ public:
|
|||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
|
thread_loop( (int ss=0;ss<Grid()->oSites();ss++),{
|
||||||
siteVector res = Zero();
|
siteVector res = Zero();
|
||||||
siteVector nbr;
|
siteVector nbr;
|
||||||
int ptype;
|
int ptype;
|
||||||
@ -291,7 +291,7 @@ public:
|
|||||||
res = res + A[point][ss]*nbr;
|
res = res + A[point][ss]*nbr;
|
||||||
}
|
}
|
||||||
vstream(out[ss],res);
|
vstream(out[ss],res);
|
||||||
}
|
});
|
||||||
return norm2(out);
|
return norm2(out);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -384,14 +384,14 @@ public:
|
|||||||
Subspace.ProjectToSubspace(oProj,oblock);
|
Subspace.ProjectToSubspace(oProj,oblock);
|
||||||
// blockProject(iProj,iblock,Subspace.subspace);
|
// blockProject(iProj,iblock,Subspace.subspace);
|
||||||
// blockProject(oProj,oblock,Subspace.subspace);
|
// blockProject(oProj,oblock,Subspace.subspace);
|
||||||
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
|
thread_loop( (int ss=0;ss<Grid()->oSites();ss++),{
|
||||||
for(int j=0;j<nbasis;j++){
|
for(int j=0;j<nbasis;j++){
|
||||||
if( disp!= 0 ) {
|
if( disp!= 0 ) {
|
||||||
A[p][ss](j,i) = oProj[ss](j);
|
A[p][ss](j,i) = oProj[ss](j);
|
||||||
}
|
}
|
||||||
A[self_stencil][ss](j,i) = A[self_stencil][ss](j,i) + iProj[ss](j);
|
A[self_stencil][ss](j,i) = A[self_stencil][ss](j,i) + iProj[ss](j);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -55,11 +55,11 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
|
|||||||
typedef typename Field::vector_object vobj;
|
typedef typename Field::vector_object vobj;
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
parallel_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector < vobj > B(Nm); // Thread private
|
std::vector < vobj > B(Nm); // Thread private
|
||||||
|
|
||||||
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
|
thread_loop_in_region( (int ss=0;ss < grid->oSites();ss++),{
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
for(int j=j0; j<j1; ++j){
|
for(int j=j0; j<j1; ++j){
|
||||||
@ -70,7 +70,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
|
|||||||
for(int j=j0; j<j1; ++j){
|
for(int j=j0; j<j1; ++j){
|
||||||
basis[j][ss] = B[j];
|
basis[j][ss] = B[j];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -82,13 +82,13 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
parallel_for(int ss=0;ss < grid->oSites();ss++){
|
thread_loop( (int ss=0;ss < grid->oSites();ss++),{
|
||||||
vobj B = Zero();
|
vobj B = Zero();
|
||||||
for(int k=k0; k<k1; ++k){
|
for(int k=k0; k<k1; ++k){
|
||||||
B +=Qt(j,k) * basis[k][ss];
|
B +=Qt(j,k) * basis[k][ss];
|
||||||
}
|
}
|
||||||
result[ss] = B;
|
result[ss] = B;
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
|
@ -48,7 +48,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
|
|
||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++) ,
|
thread_loop_collapse( 2, (int n=0;n<e1;n++) ,
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int bo = n*e2;
|
int bo = n*e2;
|
||||||
@ -92,7 +92,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
|
|||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++), {
|
thread_loop_collapse( 2, (int n=0;n<e1;n++), {
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
@ -108,7 +108,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
|
|||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl;
|
std::cout << " Dense packed buffer WARNING " <<std::endl;
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse( 2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o=n*n1;
|
int o=n*n1;
|
||||||
@ -142,7 +142,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3 ) {
|
if ( cbmask ==0x3 ) {
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse( 2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int bo =n*rhs.Grid()->_slice_block[dimension];
|
int bo =n*rhs.Grid()->_slice_block[dimension];
|
||||||
@ -184,7 +184,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse(2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
||||||
@ -228,7 +228,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse( 2,(int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
//lhs[lo+o]=rhs[ro+o];
|
//lhs[lo+o]=rhs[ro+o];
|
||||||
@ -236,7 +236,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse(2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
||||||
@ -266,7 +266,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
int e2=rhs.Grid()->_slice_block [dimension];
|
int e2=rhs.Grid()->_slice_block [dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
thread_loop_collapse( (int n=0;n<e1;n++),{
|
thread_loop_collapse(2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
|
@ -48,7 +48,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
|
|
||||||
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
||||||
|
|
||||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
||||||
int mywork, myoff;
|
int mywork, myoff;
|
||||||
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
vnrm = vnrm + innerProductD(left[ss],right[ss]);
|
vnrm = vnrm + innerProductD(left[ss],right[ss]);
|
||||||
}
|
}
|
||||||
sumarray[thr]=TensorRemove(vnrm) ;
|
sumarray[thr]=TensorRemove(vnrm) ;
|
||||||
}
|
});
|
||||||
|
|
||||||
vector_type vvnrm; vvnrm=Zero(); // sum across threads
|
vector_type vvnrm; vvnrm=Zero(); // sum across threads
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
@ -104,7 +104,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
|||||||
sumarray[i]=Zero();
|
sumarray[i]=Zero();
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
||||||
int mywork, myoff;
|
int mywork, myoff;
|
||||||
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
@ -113,7 +113,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
|||||||
vvsum = vvsum + arg[ss];
|
vvsum = vvsum + arg[ss];
|
||||||
}
|
}
|
||||||
sumarray[thr]=vvsum;
|
sumarray[thr]=vvsum;
|
||||||
}
|
});
|
||||||
|
|
||||||
vobj vsum=Zero(); // sum across threads
|
vobj vsum=Zero(); // sum across threads
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
@ -172,8 +172,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
thread_loop( (int r=0;r<rd;r++),{
|
||||||
parallel_for(int r=0;r<rd;r++){
|
|
||||||
|
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
|
|
||||||
@ -183,7 +182,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
lvSum[r]=lvSum[r]+Data[ss];
|
lvSum[r]=lvSum[r]+Data[ss];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||||
std::vector<int> icoor(Nd);
|
std::vector<int> icoor(Nd);
|
||||||
@ -252,7 +251,7 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int e2= grid->_slice_block [orthogdim];
|
int e2= grid->_slice_block [orthogdim];
|
||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
parallel_for(int r=0;r<rd;r++){
|
thread_loop( (int r=0;r<rd;r++),{
|
||||||
|
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
|
|
||||||
@ -263,7 +262,7 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
lvSum[r]=lvSum[r]+vv;
|
lvSum[r]=lvSum[r]+vv;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||||
std::vector<int> icoor(Nd);
|
std::vector<int> icoor(Nd);
|
||||||
@ -359,12 +358,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
|
|
||||||
tensor_reduced at; at=av;
|
tensor_reduced at; at=av;
|
||||||
|
|
||||||
parallel_for_nest2(int n=0;n<e1;n++){
|
thread_loop_collapse(2, (int n=0;n<e1;n++),{
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
R[ss] = at*X[ss]+Y[ss];
|
R[ss] = at*X[ss]+Y[ss];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
M5Dcalls++;
|
M5Dcalls++;
|
||||||
M5Dtime-=usecond();
|
M5Dtime-=usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
if ( s==0 ) {
|
if ( s==0 ) {
|
||||||
@ -76,7 +76,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
M5Dtime+=usecond();
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,7 +97,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
M5Dcalls++;
|
M5Dcalls++;
|
||||||
M5Dtime-=usecond();
|
M5Dtime-=usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
if ( s==0 ) {
|
if ( s==0 ) {
|
||||||
@ -120,7 +120,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
M5Dtime+=usecond();
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,7 +135,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
|
|||||||
MooeeInvCalls++;
|
MooeeInvCalls++;
|
||||||
MooeeInvTime-=usecond();
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
|
|
||||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||||
@ -163,7 +163,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
|
|||||||
spProj5m(tmp,chi[ss+s+1]);
|
spProj5m(tmp,chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
MooeeInvTime+=usecond();
|
MooeeInvTime+=usecond();
|
||||||
|
|
||||||
@ -193,7 +193,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
|
|||||||
MooeeInvCalls++;
|
MooeeInvCalls++;
|
||||||
MooeeInvTime-=usecond();
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
|
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
|
|
||||||
@ -221,7 +221,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
|
|||||||
spProj5p(tmp,chi[ss+s+1]);
|
spProj5p(tmp,chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - leec[s]*tmp;
|
chi[ss+s] = chi[ss+s] - leec[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
MooeeInvTime+=usecond();
|
MooeeInvTime+=usecond();
|
||||||
|
|
||||||
|
@ -93,7 +93,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
|
|
||||||
assert(Nc==3);
|
assert(Nc==3);
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
|
||||||
#if 0
|
#if 0
|
||||||
alignas(64) SiteHalfSpinor hp;
|
alignas(64) SiteHalfSpinor hp;
|
||||||
alignas(64) SiteHalfSpinor hm;
|
alignas(64) SiteHalfSpinor hm;
|
||||||
@ -190,7 +190,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
});
|
||||||
M5Dtime+=usecond();
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -233,7 +233,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
|
|
||||||
M5Dcalls++;
|
M5Dcalls++;
|
||||||
M5Dtime-=usecond();
|
M5Dtime-=usecond();
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
|
||||||
#if 0
|
#if 0
|
||||||
alignas(64) SiteHalfSpinor hp;
|
alignas(64) SiteHalfSpinor hp;
|
||||||
alignas(64) SiteHalfSpinor hm;
|
alignas(64) SiteHalfSpinor hm;
|
||||||
@ -327,7 +327,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
vstream(chi[ss+v]()(3)(2),p_32);
|
vstream(chi[ss+v]()(3)(2),p_32);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
});
|
||||||
M5Dtime+=usecond();
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -792,13 +792,13 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
MooeeInvTime-=usecond();
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
if ( switcheroo<Coeff_t>::iscomplex() ) {
|
if ( switcheroo<Coeff_t>::iscomplex() ) {
|
||||||
parallel_for(auto site=0;site<vol;site++){
|
thread_loop( (auto site=0;site<vol;site++),{
|
||||||
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for(auto site=0;site<vol;site++){
|
thread_loop( (auto site=0;site<vol;site++),{
|
||||||
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
MooeeInvTime+=usecond();
|
MooeeInvTime+=usecond();
|
||||||
}
|
}
|
||||||
|
@ -52,7 +52,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionFiel
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
if(s==0) {
|
if(s==0) {
|
||||||
@ -72,7 +72,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionFiel
|
|||||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -90,7 +90,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionF
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
if(s==0) {
|
if(s==0) {
|
||||||
@ -110,7 +110,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionF
|
|||||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -126,7 +126,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||||
|
|
||||||
auto tmp1 = psi[0];
|
auto tmp1 = psi[0];
|
||||||
auto tmp2 = psi[0];
|
auto tmp2 = psi[0];
|
||||||
@ -158,7 +158,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField
|
|||||||
spProj5m(tmp1, chi[ss+s+1]);
|
spProj5m(tmp1, chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
@ -190,7 +190,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionFi
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||||
|
|
||||||
auto tmp1 = psi[0];
|
auto tmp1 = psi[0];
|
||||||
auto tmp2 = psi[0];
|
auto tmp2 = psi[0];
|
||||||
@ -221,7 +221,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionFi
|
|||||||
spProj5p(tmp1, chi[ss+s+1]);
|
spProj5p(tmp1, chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
|
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
|
@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionFiel
|
|||||||
|
|
||||||
assert(Nc == 3);
|
assert(Nc == 3);
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
@ -191,7 +191,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionFiel
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -232,7 +232,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionF
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
@ -330,7 +330,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionF
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -566,13 +566,13 @@ void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, Fermion
|
|||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
if(switcheroo<Coeff_t>::iscomplex()){
|
if(switcheroo<Coeff_t>::iscomplex()){
|
||||||
parallel_for(auto site=0; site<vol; site++){
|
thread_loop((auto site=0; site<vol; site++),{
|
||||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for(auto site=0; site<vol; site++){
|
thread_loop((auto site=0; site<vol; site++){
|
||||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
|
@ -268,13 +268,13 @@ public:
|
|||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
|
|
||||||
parallel_for(int sss=0;sss<tmp.Grid()->oSites();sss++){
|
thread_loop( (int sss=0;sss<tmp.Grid()->oSites();sss++),{
|
||||||
int sU=sss;
|
int sU=sss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
|
tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -591,10 +591,10 @@ public:
|
|||||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(auto ss=U.begin();ss<U.end();ss++){
|
thread_loop( (auto ss=U.begin();ss<U.end();ss++),{
|
||||||
Uds[ss](0)(mu) = U[ss]();
|
Uds[ss](0)(mu) = U[ss]();
|
||||||
Uds[ss](1)(mu) = Uconj[ss]();
|
Uds[ss](1)(mu) = Uconj[ss]();
|
||||||
}
|
});
|
||||||
|
|
||||||
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
||||||
Uconj = adj(Cshift(Uconj,mu,-1));
|
Uconj = adj(Cshift(Uconj,mu,-1));
|
||||||
@ -605,18 +605,18 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
parallel_for(auto ss=U.begin();ss<U.end();ss++){
|
thread_loop((auto ss=U.begin();ss<U.end();ss++),{
|
||||||
Uds[ss](0)(mu+4) = Utmp[ss]();
|
Uds[ss](0)(mu+4) = Utmp[ss]();
|
||||||
}
|
});
|
||||||
|
|
||||||
Utmp = Uconj;
|
Utmp = Uconj;
|
||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Utmp = where(coor==0,U,Utmp);
|
Utmp = where(coor==0,U,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(auto ss=U.begin();ss<U.end();ss++){
|
thread_loop((auto ss=U.begin();ss<U.end();ss++),{
|
||||||
Uds[ss](1)(mu+4) = Utmp[ss]();
|
Uds[ss](1)(mu+4) = Utmp[ss]();
|
||||||
}
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -627,9 +627,9 @@ public:
|
|||||||
GaugeLinkField link(mat.Grid());
|
GaugeLinkField link(mat.Grid());
|
||||||
// use lorentz for flavour as hack.
|
// use lorentz for flavour as hack.
|
||||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||||
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
|
thread_loop((auto ss = tmp.begin(); ss < tmp.end(); ss++), {
|
||||||
link[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
|
link[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
|
||||||
}
|
});
|
||||||
PokeIndex<LorentzIndex>(mat, link, mu);
|
PokeIndex<LorentzIndex>(mat, link, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -640,13 +640,13 @@ public:
|
|||||||
|
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
parallel_for(int ss = 0; ss < tmp.Grid()->oSites(); ss++) {
|
thread_loop((int ss = 0; ss < tmp.Grid()->oSites(); ss++) ,{
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
int sF = s + Ls * ss;
|
int sF = s + Ls * ss;
|
||||||
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
|
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
|
||||||
tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -172,13 +172,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
|
|||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DhopDirK(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
|
Kernels::DhopDirK(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -240,15 +240,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), {
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
|
@ -50,7 +50,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &p
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
if(s==0){
|
if(s==0){
|
||||||
@ -70,7 +70,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &p
|
|||||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -91,7 +91,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionFi
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
if(s==0){
|
if(s==0){
|
||||||
@ -114,7 +114,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionFi
|
|||||||
else{ spProj5m(tmp, psi[ss+shift_s]); }
|
else{ spProj5m(tmp, psi[ss+shift_s]); }
|
||||||
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
|
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -133,7 +133,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
if(s==0) {
|
if(s==0) {
|
||||||
@ -153,7 +153,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField
|
|||||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -174,7 +174,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const Fermio
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
chi[ss+Ls-1] = Zero();
|
chi[ss+Ls-1] = Zero();
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
@ -198,7 +198,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const Fermio
|
|||||||
else{ spProj5m(tmp, psi[ss+s]); }
|
else{ spProj5m(tmp, psi[ss+s]); }
|
||||||
chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
|
chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -216,7 +216,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &ch
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
|
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
|
|
||||||
@ -245,7 +245,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &ch
|
|||||||
spProj5m(tmp, chi[ss+s+1]);
|
spProj5m(tmp, chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
@ -261,7 +261,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionFie
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
|
|
||||||
auto tmp1 = psi[0];
|
auto tmp1 = psi[0];
|
||||||
auto tmp2 = psi[0];
|
auto tmp2 = psi[0];
|
||||||
@ -300,7 +300,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionFie
|
|||||||
spProj5m(tmp1, chi[ss+s]);
|
spProj5m(tmp1, chi[ss+s]);
|
||||||
chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
|
chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
@ -318,7 +318,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
|
|
||||||
auto tmp = psi[0];
|
auto tmp = psi[0];
|
||||||
|
|
||||||
@ -347,7 +347,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField
|
|||||||
spProj5p(tmp, chi[ss+s+1]);
|
spProj5p(tmp, chi[ss+s+1]);
|
||||||
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
@ -363,7 +363,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, Fermion
|
|||||||
this->MooeeInvCalls++;
|
this->MooeeInvCalls++;
|
||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||||
|
|
||||||
auto tmp1 = psi[0];
|
auto tmp1 = psi[0];
|
||||||
auto tmp2 = psi[0];
|
auto tmp2 = psi[0];
|
||||||
@ -401,7 +401,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, Fermion
|
|||||||
spProj5p(tmp1, chi[ss+s]);
|
spProj5p(tmp1, chi[ss+s]);
|
||||||
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
|
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
}
|
}
|
||||||
|
@ -100,7 +100,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& p
|
|||||||
|
|
||||||
assert(Nc == 3);
|
assert(Nc == 3);
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& p
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -263,7 +263,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionFi
|
|||||||
|
|
||||||
assert(Nc == 3);
|
assert(Nc == 3);
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
|
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
|
||||||
@ -381,7 +381,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionFi
|
|||||||
vstream(chi[ss+v]()(3)(1), p_31);
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
vstream(chi[ss+v]()(3)(2), p_32);
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
|
|
||||||
@ -424,7 +424,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
@ -525,7 +525,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
}
|
}
|
||||||
@ -584,7 +584,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const Fermio
|
|||||||
this->M5Dcalls++;
|
this->M5Dcalls++;
|
||||||
this->M5Dtime -= usecond();
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||||
|
|
||||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
|
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
|
||||||
@ -703,7 +703,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const Fermio
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
});
|
||||||
|
|
||||||
this->M5Dtime += usecond();
|
this->M5Dtime += usecond();
|
||||||
|
|
||||||
@ -943,13 +943,13 @@ void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionFiel
|
|||||||
this->MooeeInvTime -= usecond();
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
if(switcheroo<Coeff_t>::iscomplex()){
|
if(switcheroo<Coeff_t>::iscomplex()){
|
||||||
parallel_for(auto site=0; site<vol; site++){
|
thread_loop( (auto site=0; site<vol; site++),{
|
||||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for(auto site=0; site<vol; site++){
|
thread_loop( (auto site=0; site<vol; site++),{
|
||||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
this->MooeeInvTime += usecond();
|
this->MooeeInvTime += usecond();
|
||||||
|
@ -54,10 +54,10 @@ public:
|
|||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
|
assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss++){
|
thread_loop( (int ss=0;ss<grid->oSites();ss++),{
|
||||||
vobj tmp = s[ss % Ls]*in[ss];
|
vobj tmp = s[ss % Ls]*in[ss];
|
||||||
vstream(out[ss],tmp);
|
vstream(out[ss],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
|
RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
|
||||||
|
@ -216,9 +216,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Call the single hop
|
// Call the single hop
|
||||||
////////////////////////
|
////////////////////////
|
||||||
parallel_for (int sss = 0; sss < B.Grid()->oSites(); sss++) {
|
thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++) ,{
|
||||||
Kernels::DhopDirK(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
|
Kernels::DhopDirK(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
|
||||||
}
|
});
|
||||||
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
@ -317,9 +317,9 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
|
|||||||
|
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
|
|
||||||
parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) {
|
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||||
Kernels::DhopDirK(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
Kernels::DhopDirK(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
||||||
}
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -333,13 +333,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) {
|
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||||
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) {
|
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||||
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -366,8 +366,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
// Inefficient comms method but not performance critical.
|
// Inefficient comms method but not performance critical.
|
||||||
tmp1 = Cshift(q_in_1, mu, 1);
|
tmp1 = Cshift(q_in_1, mu, 1);
|
||||||
tmp2 = Cshift(q_in_2, mu, 1);
|
tmp2 = Cshift(q_in_2, mu, 1);
|
||||||
parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU)
|
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
|
||||||
{
|
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1[sU],
|
Kernels::ContractConservedCurrentSiteFwd(tmp1[sU],
|
||||||
q_in_2[sU],
|
q_in_2[sU],
|
||||||
q_out[sU],
|
q_out[sU],
|
||||||
@ -376,7 +375,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
tmp2[sU],
|
tmp2[sU],
|
||||||
q_out[sU],
|
q_out[sU],
|
||||||
Umu, sU, mu);
|
Umu, sU, mu);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -415,38 +414,36 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
tmp = ph*q_in;
|
tmp = ph*q_in;
|
||||||
tmpBwd = Cshift(tmp, mu, -1);
|
tmpBwd = Cshift(tmp, mu, -1);
|
||||||
|
|
||||||
parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU)
|
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
|
||||||
{
|
|
||||||
// Compute the sequential conserved current insertion only if our simd
|
|
||||||
// object contains a timeslice we need.
|
|
||||||
vInteger t_mask = ((coords[sU] >= tmin) &&
|
|
||||||
(coords[sU] <= tmax));
|
|
||||||
Integer timeSlices = Reduce(t_mask);
|
|
||||||
|
|
||||||
if (timeSlices > 0)
|
// Compute the sequential conserved current insertion only if our simd
|
||||||
{
|
// object contains a timeslice we need.
|
||||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU],
|
vInteger t_mask = ((coords[sU] >= tmin) &&
|
||||||
q_out[sU],
|
(coords[sU] <= tmax));
|
||||||
Umu, sU, mu, t_mask);
|
Integer timeSlices = Reduce(t_mask);
|
||||||
}
|
|
||||||
|
|
||||||
// Repeat for backward direction.
|
if (timeSlices > 0) {
|
||||||
t_mask = ((coords[sU] >= (tmin + tshift)) &&
|
Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU],
|
||||||
(coords[sU] <= (tmax + tshift)));
|
q_out[sU],
|
||||||
|
Umu, sU, mu, t_mask);
|
||||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
|
||||||
unsigned int t0 = 0;
|
|
||||||
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 ));
|
|
||||||
|
|
||||||
timeSlices = Reduce(t_mask);
|
|
||||||
|
|
||||||
if (timeSlices > 0)
|
|
||||||
{
|
|
||||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU],
|
|
||||||
q_out[sU],
|
|
||||||
Umu, sU, mu, t_mask);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Repeat for backward direction.
|
||||||
|
t_mask = ((coords[sU] >= (tmin + tshift)) &&
|
||||||
|
(coords[sU] <= (tmax + tshift)));
|
||||||
|
|
||||||
|
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||||
|
unsigned int t0 = 0;
|
||||||
|
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 ));
|
||||||
|
|
||||||
|
timeSlices = Reduce(t_mask);
|
||||||
|
|
||||||
|
if (timeSlices > 0) {
|
||||||
|
Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU],
|
||||||
|
q_out[sU],
|
||||||
|
Umu, sU, mu, t_mask);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonFermion);
|
FermOpTemplateInstantiate(WilsonFermion);
|
||||||
|
@ -244,13 +244,13 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
|||||||
assert(dirdisp<=7);
|
assert(dirdisp<=7);
|
||||||
assert(dirdisp>=0);
|
assert(dirdisp>=0);
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DhopDirK(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
|
Kernels::DhopDirK(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -293,7 +293,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
|
|
||||||
DerivDhopComputeTime -= usecond();
|
DerivDhopComputeTime -= usecond();
|
||||||
parallel_for (int sss = 0; sss < U.Grid()->oSites(); sss++) {
|
thread_loop( (int sss = 0; sss < U.Grid()->oSites(); sss++) ,{
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
int sU = sss;
|
int sU = sss;
|
||||||
int sF = s + Ls * sU;
|
int sF = s + Ls * sU;
|
||||||
@ -307,7 +307,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
|||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
@ -464,18 +464,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
DhopComputeTime2-=usecond();
|
DhopComputeTime2-=usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
int sz=st.surface_list.size();
|
int sz=st.surface_list.size();
|
||||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||||
int sU = st.surface_list[ss];
|
int sU = st.surface_list[ss];
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
int sz=st.surface_list.size();
|
int sz=st.surface_list.size();
|
||||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||||
int sU = st.surface_list[ss];
|
int sU = st.surface_list[ss];
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime2+=usecond();
|
DhopComputeTime2+=usecond();
|
||||||
#else
|
#else
|
||||||
@ -502,17 +502,17 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
|||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
|
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
DhopComputeTime+=usecond();
|
||||||
}
|
}
|
||||||
@ -739,41 +739,37 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
|
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
|
||||||
tmp1 = Cshift(q_in_1, mu + 1, 1);
|
tmp1 = Cshift(q_in_1, mu + 1, 1);
|
||||||
tmp2 = Cshift(q_in_2, mu + 1, 1);
|
tmp2 = Cshift(q_in_2, mu + 1, 1);
|
||||||
parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU)
|
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
|
||||||
{
|
unsigned int sF1 = sU * LLs;
|
||||||
unsigned int sF1 = sU * LLs;
|
unsigned int sF2 = (sU + 1) * LLs - 1;
|
||||||
unsigned int sF2 = (sU + 1) * LLs - 1;
|
|
||||||
|
for (unsigned int s = 0; s < LLs; ++s) {
|
||||||
|
|
||||||
for (unsigned int s = 0; s < LLs; ++s)
|
bool axial_sign = ((curr_type == Current::Axial) && \
|
||||||
{
|
(s < (LLs / 2)));
|
||||||
bool axial_sign = ((curr_type == Current::Axial) && \
|
SitePropagator qSite2, qmuSite2;
|
||||||
(s < (LLs / 2)));
|
|
||||||
SitePropagator qSite2, qmuSite2;
|
// If vectorised in 5th dimension, reverse q2 vector to match up
|
||||||
|
// sites correctly.
|
||||||
// If vectorised in 5th dimension, reverse q2 vector to match up
|
if (Impl::LsVectorised) {
|
||||||
// sites correctly.
|
REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs);
|
||||||
if (Impl::LsVectorised)
|
REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs);
|
||||||
{
|
} else {
|
||||||
REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs);
|
qSite2 = q_in_2[sF2];
|
||||||
REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs);
|
qmuSite2 = tmp2[sF2];
|
||||||
}
|
}
|
||||||
else
|
Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1],
|
||||||
{
|
qSite2,
|
||||||
qSite2 = q_in_2[sF2];
|
q_out[sU],
|
||||||
qmuSite2 = tmp2[sF2];
|
Umu, sU, mu, axial_sign);
|
||||||
}
|
Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1],
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1],
|
qmuSite2,
|
||||||
qSite2,
|
q_out[sU],
|
||||||
q_out[sU],
|
Umu, sU, mu, axial_sign);
|
||||||
Umu, sU, mu, axial_sign);
|
sF1++;
|
||||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1],
|
sF2--;
|
||||||
qmuSite2,
|
|
||||||
q_out[sU],
|
|
||||||
Umu, sU, mu, axial_sign);
|
|
||||||
sF1++;
|
|
||||||
sF2--;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -817,50 +813,46 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
tmp = ph*q_in;
|
tmp = ph*q_in;
|
||||||
tmpBwd = Cshift(tmp, mu + 1, -1);
|
tmpBwd = Cshift(tmp, mu + 1, -1);
|
||||||
|
|
||||||
parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU)
|
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) ,{
|
||||||
{
|
// Compute the sequential conserved current insertion only if our simd
|
||||||
// Compute the sequential conserved current insertion only if our simd
|
// object contains a timeslice we need.
|
||||||
// object contains a timeslice we need.
|
vInteger t_mask = ((coords[sU] >= tmin) &&
|
||||||
vInteger t_mask = ((coords[sU] >= tmin) &&
|
(coords[sU] <= tmax));
|
||||||
(coords[sU] <= tmax));
|
Integer timeSlices = Reduce(t_mask);
|
||||||
Integer timeSlices = Reduce(t_mask);
|
|
||||||
|
if (timeSlices > 0) {
|
||||||
|
|
||||||
if (timeSlices > 0)
|
unsigned int sF = sU * LLs;
|
||||||
{
|
for (unsigned int s = 0; s < LLs; ++s) {
|
||||||
unsigned int sF = sU * LLs;
|
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
||||||
for (unsigned int s = 0; s < LLs; ++s)
|
Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF],
|
||||||
{
|
q_out[sF], Umu, sU,
|
||||||
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
mu, t_mask, axial_sign);
|
||||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF],
|
++sF;
|
||||||
q_out[sF], Umu, sU,
|
}
|
||||||
mu, t_mask, axial_sign);
|
|
||||||
++sF;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repeat for backward direction.
|
|
||||||
t_mask = ((coords[sU] >= (tmin + tshift)) &&
|
|
||||||
(coords[sU] <= (tmax + tshift)));
|
|
||||||
|
|
||||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
|
||||||
unsigned int t0 = 0;
|
|
||||||
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 ));
|
|
||||||
|
|
||||||
timeSlices = Reduce(t_mask);
|
|
||||||
|
|
||||||
if (timeSlices > 0)
|
|
||||||
{
|
|
||||||
unsigned int sF = sU * LLs;
|
|
||||||
for (unsigned int s = 0; s < LLs; ++s)
|
|
||||||
{
|
|
||||||
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
|
||||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF],
|
|
||||||
q_out[sF], Umu, sU,
|
|
||||||
mu, t_mask, axial_sign);
|
|
||||||
++sF;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Repeat for backward direction.
|
||||||
|
t_mask = ((coords[sU] >= (tmin + tshift)) &&
|
||||||
|
(coords[sU] <= (tmax + tshift)));
|
||||||
|
|
||||||
|
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||||
|
unsigned int t0 = 0;
|
||||||
|
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 ));
|
||||||
|
|
||||||
|
timeSlices = Reduce(t_mask);
|
||||||
|
|
||||||
|
if (timeSlices > 0) {
|
||||||
|
unsigned int sF = sU * LLs;
|
||||||
|
for (unsigned int s = 0; s < LLs; ++s) {
|
||||||
|
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
||||||
|
Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF],
|
||||||
|
q_out[sF], Umu, sU,
|
||||||
|
mu, t_mask, axial_sign);
|
||||||
|
++sF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonFermion5D);
|
FermOpTemplateInstantiate(WilsonFermion5D);
|
||||||
|
@ -101,10 +101,10 @@ public:
|
|||||||
//static std::chrono::duration<double> diff;
|
//static std::chrono::duration<double> diff;
|
||||||
|
|
||||||
//auto start = std::chrono::high_resolution_clock::now();
|
//auto start = std::chrono::high_resolution_clock::now();
|
||||||
parallel_for(int ss=0;ss<P.Grid()->oSites();ss++){
|
thread_loop( (int ss=0;ss<P.Grid()->oSites();ss++),{
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
for (int mu = 0; mu < Nd; mu++)
|
||||||
U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
|
U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
|
||||||
}
|
});
|
||||||
|
|
||||||
//auto end = std::chrono::high_resolution_clock::now();
|
//auto end = std::chrono::high_resolution_clock::now();
|
||||||
// diff += end - start;
|
// diff += end - start;
|
||||||
|
@ -82,7 +82,7 @@ public:
|
|||||||
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
|
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
|
||||||
for (int mu = 0; mu < Ndim; mu++) {
|
for (int mu = 0; mu < Ndim; mu++) {
|
||||||
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||||
parallel_for (int i = 0; i < p.Grid()->oSites(); i++) {
|
thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{
|
||||||
int permute_type;
|
int permute_type;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
vobj temp2;
|
vobj temp2;
|
||||||
@ -101,7 +101,7 @@ public:
|
|||||||
} else {
|
} else {
|
||||||
action[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
|
action[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
// action -= pshift*p + p*pshift;
|
// action -= pshift*p + p*pshift;
|
||||||
}
|
}
|
||||||
// NB the trace in the algebra is normalised to 1/2
|
// NB the trace in the algebra is normalised to 1/2
|
||||||
@ -118,7 +118,7 @@ public:
|
|||||||
|
|
||||||
//for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
//for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||||
for (int point = 0; point < npoint; point++) {
|
for (int point = 0; point < npoint; point++) {
|
||||||
parallel_for (int i = 0; i < p.Grid()->oSites(); i++) {
|
thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{
|
||||||
const vobj *temp;
|
const vobj *temp;
|
||||||
vobj temp2;
|
vobj temp2;
|
||||||
int permute_type;
|
int permute_type;
|
||||||
@ -136,7 +136,7 @@ public:
|
|||||||
} else {
|
} else {
|
||||||
force[i] -= phiStencil.CommBuf()[SE->_offset];
|
force[i] -= phiStencil.CommBuf()[SE->_offset];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -48,12 +48,12 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss++){
|
thread_loop( (int ss=0;ss<grid->oSites();ss++),{
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
tmp = a*x[ss];
|
tmp = a*x[ss];
|
||||||
tmp = tmp + G5*(b*timesI(x[ss]));
|
tmp = tmp + G5*(b*timesI(x[ss]));
|
||||||
vstream(z[ss],tmp);
|
vstream(z[ss],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -64,10 +64,10 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp = a*x[ss+s]+b*y[ss+sp];
|
vobj tmp = a*x[ss+s]+b*y[ss+sp];
|
||||||
vstream(z[ss+s],tmp);
|
vstream(z[ss+s],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -80,12 +80,12 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
|
|||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
tmp = G5*x[ss+s]*a;
|
tmp = G5*x[ss+s]*a;
|
||||||
tmp = tmp + b*y[ss+sp];
|
tmp = tmp + b*y[ss+sp];
|
||||||
vstream(z[ss+s],tmp);
|
vstream(z[ss+s],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -97,12 +97,12 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
tmp = G5*y[ss+sp]*b;
|
tmp = G5*y[ss+sp]*b;
|
||||||
tmp = tmp + a*x[ss+s];
|
tmp = tmp + a*x[ss+s];
|
||||||
vstream(z[ss+s],tmp);
|
vstream(z[ss+s],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -115,13 +115,13 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
|
|||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp1;
|
vobj tmp1;
|
||||||
vobj tmp2;
|
vobj tmp2;
|
||||||
tmp1 = a*x[ss+s]+b*y[ss+sp];
|
tmp1 = a*x[ss+s]+b*y[ss+sp];
|
||||||
tmp2 = G5*tmp1;
|
tmp2 = G5*tmp1;
|
||||||
vstream(z[ss+s],tmp2);
|
vstream(z[ss+s],tmp2);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -132,12 +132,12 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
spProj5m(tmp,y[ss+sp]);
|
spProj5m(tmp,y[ss+sp]);
|
||||||
tmp = a*x[ss+s]+b*tmp;
|
tmp = a*x[ss+s]+b*tmp;
|
||||||
vstream(z[ss+s],tmp);
|
vstream(z[ss+s],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class Coeff>
|
template<class vobj,class Coeff>
|
||||||
@ -148,12 +148,12 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
spProj5p(tmp,y[ss+sp]);
|
spProj5p(tmp,y[ss+sp]);
|
||||||
tmp = a*x[ss+s]+b*tmp;
|
tmp = a*x[ss+s]+b*tmp;
|
||||||
vstream(z[ss+s],tmp);
|
vstream(z[ss+s],tmp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
@ -164,14 +164,14 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls) {
|
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls) {
|
||||||
vobj tmp;
|
vobj tmp;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sp = Ls-1-s;
|
int sp = Ls-1-s;
|
||||||
tmp = G5*x[ss+s];
|
tmp = G5*x[ss+s];
|
||||||
vstream(z[ss+sp],tmp);
|
vstream(z[ss+sp],tmp);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -223,7 +223,7 @@ public:
|
|||||||
int i0, i1;
|
int i0, i1;
|
||||||
su2SubGroupIndex(i0, i1, su2_index);
|
su2SubGroupIndex(i0, i1, su2_index);
|
||||||
|
|
||||||
parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{
|
||||||
subgroup[ss]()()(0, 0) = source[ss]()()(i0, i0);
|
subgroup[ss]()()(0, 0) = source[ss]()()(i0, i0);
|
||||||
subgroup[ss]()()(0, 1) = source[ss]()()(i0, i1);
|
subgroup[ss]()()(0, 1) = source[ss]()()(i0, i1);
|
||||||
subgroup[ss]()()(1, 0) = source[ss]()()(i1, i0);
|
subgroup[ss]()()(1, 0) = source[ss]()()(i1, i0);
|
||||||
@ -238,7 +238,7 @@ public:
|
|||||||
// this should be purely real
|
// this should be purely real
|
||||||
Determinant[ss] =
|
Determinant[ss] =
|
||||||
Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
|
Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -253,12 +253,12 @@ public:
|
|||||||
su2SubGroupIndex(i0, i1, su2_index);
|
su2SubGroupIndex(i0, i1, su2_index);
|
||||||
|
|
||||||
dest = 1.0; // start out with identity
|
dest = 1.0; // start out with identity
|
||||||
parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
|
thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{
|
||||||
dest[ss]()()(i0, i0) = subgroup[ss]()()(0, 0);
|
dest[ss]()()(i0, i0) = subgroup[ss]()()(0, 0);
|
||||||
dest[ss]()()(i0, i1) = subgroup[ss]()()(0, 1);
|
dest[ss]()()(i0, i1) = subgroup[ss]()()(0, 1);
|
||||||
dest[ss]()()(i1, i0) = subgroup[ss]()()(1, 0);
|
dest[ss]()()(i1, i0) = subgroup[ss]()()(1, 0);
|
||||||
dest[ss]()()(i1, i1) = subgroup[ss]()()(1, 1);
|
dest[ss]()()(i1, i1) = subgroup[ss]()()(1, 1);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
|
@ -63,9 +63,9 @@ template<class vobj,class cobj,class compressor>
|
|||||||
void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
||||||
{
|
{
|
||||||
int num=table.size();
|
int num=table.size();
|
||||||
parallel_for(int i=0;i<num;i++){
|
thread_loop( (int i=0;i<num;i++), {
|
||||||
compress.Compress(&buffer[off],table[i].first,rhs[so+table[i].second]);
|
compress.Compress(&buffer[off],table[i].first,rhs[so+table[i].second]);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -83,10 +83,10 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
|
|||||||
assert( (table.size()&0x1)==0);
|
assert( (table.size()&0x1)==0);
|
||||||
int num=table.size()/2;
|
int num=table.size()/2;
|
||||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||||
parallel_for(int j=0;j<num;j++){
|
thread_loop( (int j=0;j<num;j++), {
|
||||||
compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs[0],
|
compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs[0],
|
||||||
j,so+table[2*j].second,so+table[2*j+1].second,type);
|
j,so+table[2*j].second,so+table[2*j+1].second,type);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
struct StencilEntry {
|
struct StencilEntry {
|
||||||
@ -330,20 +330,14 @@ public:
|
|||||||
}
|
}
|
||||||
void Communicate(void)
|
void Communicate(void)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
thread_region
|
||||||
#pragma omp parallel
|
|
||||||
{
|
{
|
||||||
// must be called in parallel region
|
// must be called in parallel region
|
||||||
int mythread = omp_get_thread_num();
|
int mythread = thread_num();
|
||||||
int maxthreads= omp_get_max_threads();
|
int maxthreads= thread_max();
|
||||||
int nthreads = CartesianCommunicator::nCommThreads;
|
int nthreads = CartesianCommunicator::nCommThreads;
|
||||||
assert(nthreads <= maxthreads);
|
assert(nthreads <= maxthreads);
|
||||||
|
|
||||||
if (nthreads == -1) nthreads = 1;
|
if (nthreads == -1) nthreads = 1;
|
||||||
#else
|
|
||||||
int mythread = 0;
|
|
||||||
int nthreads = 1;
|
|
||||||
#endif
|
|
||||||
if (mythread < nthreads) {
|
if (mythread < nthreads) {
|
||||||
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
||||||
double start = usecond();
|
double start = usecond();
|
||||||
@ -355,9 +349,7 @@ public:
|
|||||||
comm_time_thr[mythread] += usecond() - start;
|
comm_time_thr[mythread] += usecond() - start;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef GRID_OMP
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
@ -509,20 +501,20 @@ public:
|
|||||||
|
|
||||||
for(int i=0;i<mm.size();i++){
|
for(int i=0;i<mm.size();i++){
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
parallel_for(int o=0;o<mm[i].buffer_size/2;o++){
|
thread_loop( (int o=0;o<mm[i].buffer_size/2;o++), {
|
||||||
decompress.Exchange(mm[i].mpointer,
|
decompress.Exchange(mm[i].mpointer,
|
||||||
mm[i].vpointers[0],
|
mm[i].vpointers[0],
|
||||||
mm[i].vpointers[1],
|
mm[i].vpointers[1],
|
||||||
mm[i].type,o);
|
mm[i].type,o);
|
||||||
}
|
});
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i=0;i<dd.size();i++){
|
for(int i=0;i<dd.size();i++){
|
||||||
decompresstime-=usecond();
|
decompresstime-=usecond();
|
||||||
parallel_for(int o=0;o<dd[i].buffer_size;o++){
|
thread_loop( (int o=0;o<dd[i].buffer_size;o++),{
|
||||||
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
|
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
|
||||||
}
|
});
|
||||||
decompresstime+=usecond();
|
decompresstime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,32 +52,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
|
#define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
|
||||||
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };
|
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };
|
||||||
#define thread_loop_collapse( range , ... ) _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ };
|
#define thread_loop_collapse( n, range , ... ) _Pragma("omp parallel for collapse(" #n ")") for range { __VA_ARGS__ };
|
||||||
#define thread_region _Pragma("omp parallel")
|
#define thread_region _Pragma("omp parallel")
|
||||||
#define thread_critical _Pragma("omp critical")
|
#define thread_critical _Pragma("omp critical")
|
||||||
|
#define thread_num(a) omp_get_thread_num()
|
||||||
|
#define thread_max(a) omp_get_max_threads()
|
||||||
#else
|
#else
|
||||||
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
|
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
|
||||||
#define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; };
|
#define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; };
|
||||||
#define thread_loop_collapse( range , ... ) for range { __VA_ARGS__ ; };
|
#define thread_loop_collapse( n, range , ... ) for range { __VA_ARGS__ ; };
|
||||||
#define thread_region
|
#define thread_region
|
||||||
#define thread_critical
|
#define thread_critical
|
||||||
|
#define thread_num(a) (0)
|
||||||
|
#define thread_max(a) (1)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Deprecated primitives; to eridicate when done delete and fix errors.
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
#define parallel_for _Pragma("omp parallel for schedule(static)") for
|
|
||||||
#define parallel_for_internal _Pragma("omp for schedule(static)") for
|
|
||||||
#define parallel_region thread_region
|
|
||||||
#define parallel_for_nest2 for
|
|
||||||
#else
|
|
||||||
#define parallel_for for
|
|
||||||
#define parallel_for_internal for
|
|
||||||
#define parallel_region
|
|
||||||
#define parallel_for_nest2 for
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////
|
||||||
// Accelerator primitives; fall back to threading
|
// Accelerator primitives; fall back to threading
|
||||||
|
Loading…
Reference in New Issue
Block a user