Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-14 01:35:36 +00:00)

Commit c43a2b599a: GPU support
Parent: 8c91e82ee8
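Every hunk below applies the same mechanical rewrite: a loop written as parallel_for(range){ body } becomes thread_loop((range), { body });, so the loop body is handed to the thread_loop macro as an argument instead of trailing the call as an open compound statement, presumably so the same call sites can later be lowered to an accelerator backend simply by redefining the macro. This is also why every closing "}" of a converted loop turns into "});". For reference, a minimal host-side sketch of such a macro is given here; the name and calling convention match the diff, but this particular definition is an illustrative assumption, not necessarily the exact one shipped in Grid's Threads.h.

// Minimal sketch of a thread_loop-style macro for the host-threaded case.
// Assumption: Grid's real macro may add scheduling clauses or a device path.
// The body arrives as the variadic argument, which is why call sites end in "});".
#define thread_loop( range, ... )        \
  _Pragma("omp parallel for")            \
  for range { __VA_ARGS__ };

// Example call, mirroring the pattern introduced below:
//   thread_loop( (int r = 0; r < N; r++), {
//     out[r] = 0.0;
//   });
// With OpenMP enabled this expands to an ordinary "#pragma omp parallel for" loop;
// without it, the pragma is ignored and the loop runs serially.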
@@ -67,9 +67,9 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
   // sum across these down to scalars
   // splitting the SIMD
   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd*Lblock*Rblock);
-  parallel_for (int r = 0; r < rd * Lblock * Rblock; r++){
+  thread_loop( (int r = 0; r < rd * Lblock * Rblock; r++),{
     lvSum[r] = Zero();
-  }
+  });

   std::vector<scalar_type > lsSum(ld*Lblock*Rblock,scalar_type(0.0));

@@ -79,7 +79,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,

   std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
   // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  parallel_for(int r=0;r<rd;r++){
+  thread_loop((int r=0;r<rd;r++),{

     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

@@ -110,11 +110,11 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
       }
      }
    }
-  }
+  });

   std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
   // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_loop((int rt=0;rt<rd;rt++),{

     Coordinate icoor(Nd);

@@ -139,7 +139,7 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,

     }
   }}
-  }
+  });

   std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
   for(int t=0;t<fd;t++)
@@ -199,14 +199,14 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
   int MFlvol = ld*Lblock*Rblock*Ngamma;

   std::vector<vector_type,alignedAllocator<vector_type> > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
+  thread_loop( (int r = 0; r < MFrvol; r++),{
     lvSum[r] = Zero();
-  }
+  });

   std::vector<scalar_type > lsSum(MFlvol);
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_loop( (int r = 0; r < MFlvol; r++),{
     lsSum[r]=scalar_type(0.0);
-  }
+  });

   int e1= grid->_slice_nblock[orthogdim];
   int e2= grid->_slice_block [orthogdim];
@@ -215,7 +215,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
   std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

   // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  parallel_for(int r=0;r<rd;r++){
+  thread_loop((int r=0;r<rd;r++),{

     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

@@ -252,11 +252,11 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
       }
      }
    }
-  }
+  });

   std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
   // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_loop((int rt=0;rt<rd;rt++),{

     iScalar<vector_type> temp;
     Coordinate icoor(Nd);
@@ -282,7 +282,7 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,

     }
   }}}
-  }
+  });

   std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
   for(int t=0;t<fd;t++)
@@ -347,14 +347,14 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
   int MFlvol = ld*Lblock*Rblock;

   Vector<SpinMatrix_v > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
+  thread_loop( (int r = 0; r < MFrvol; r++),{
     lvSum[r] = Zero();
-  }
+  });

   Vector<SpinMatrix_s > lsSum(MFlvol);
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_loop( (int r = 0; r < MFlvol; r++),{
     lsSum[r]=scalar_type(0.0);
-  }
+  });

   int e1= grid->_slice_nblock[orthogdim];
   int e2= grid->_slice_block [orthogdim];
@@ -363,7 +363,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
   std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

   // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  parallel_for(int r=0;r<rd;r++){
+  thread_loop((int r=0;r<rd;r++),{

     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

@@ -394,11 +394,11 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
       }
      }
    }
-  }
+  });

   std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
   // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_loop((int rt=0;rt<rd;rt++),{

     Coordinate icoor(Nd);
     ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
@@ -422,10 +422,10 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat

     }
   }}
-  }
+  });

   std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-  parallel_for(int t=0;t<fd;t++)
+  thread_loop((int t=0;t<fd;t++),
   {
     int pt = t / ld; // processor plane
     int lt = t % ld;
@@ -443,7 +443,7 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
      }
     }
   }}
-  }
+  });
   std::cout << GridLogMessage << " Done "<<std::endl;
   // defer sum over nodes.
   return;
@@ -490,14 +490,16 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
   int MFlvol = ld*Lblock*Rblock*Nmom;

   Vector<SpinMatrix_v > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
+  thread_loop( (int r = 0; r < MFrvol; r++),
+  {
     lvSum[r] = Zero();
-  }
+  });

   Vector<SpinMatrix_s > lsSum(MFlvol);
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_loop( (int r = 0; r < MFlvol; r++),
+  {
     lsSum[r]=scalar_type(0.0);
-  }
+  });

   int e1= grid->_slice_nblock[orthogdim];
   int e2= grid->_slice_block [orthogdim];
@@ -506,7 +508,8 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
   std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;

   // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
-  parallel_for(int r=0;r<rd;r++){
+  thread_loop((int r=0;r<rd;r++),
+  {

     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

@@ -545,11 +548,12 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
       }
      }
    }
-  }
+  });

   std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
   // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_loop((int rt=0;rt<rd;rt++),
+  {

     Coordinate icoor(Nd);
     ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
@@ -575,10 +579,10 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m

     }
   }}}
-  }
+  });

   std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
-  parallel_for(int t=0;t<fd;t++)
+  thread_loop((int t=0;t<fd;t++),
   {
     int pt = t / ld; // processor plane
     int lt = t % ld;
@@ -602,7 +606,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
    }}
    }
   }}
-  }
+  });
   std::cout << GridLogMessage << " Done "<<std::endl;
   // defer sum over nodes.
   return;
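To try the converted pattern outside Grid, the following self-contained example reproduces the shape of the converted loops above (zero the per-slice accumulators, then accumulate a partial sum per slice). All names here (Nslice, Nvol, data, slice_sum) are invented for illustration, and the thread_loop definition is the host-side sketch from the note above, not Grid's own header.

// slice_sum_demo.cpp -- hypothetical stand-alone illustration; compile with
//   g++ -O2 -fopenmp slice_sum_demo.cpp   (also builds and runs serially without -fopenmp)
#include <cstdio>
#include <vector>

// Host-side sketch of the macro used at the converted call sites (assumption).
#define thread_loop( range, ... ) \
  _Pragma("omp parallel for")     \
  for range { __VA_ARGS__ };

int main() {
  const int Nslice = 8;            // stands in for rd*Lblock*Rblock
  const int Nvol   = 1024;         // sites summed into each slice
  std::vector<double> data(Nslice * Nvol, 1.0);
  std::vector<double> slice_sum(Nslice);

  // Formerly: parallel_for (int r = 0; r < Nslice; r++){ slice_sum[r] = 0.0; }
  thread_loop( (int r = 0; r < Nslice; r++), {
    slice_sum[r] = 0.0;
  });

  // One thread owns each slice and accumulates its partial sum, mirroring the
  // "first parallel loop" structure in sliceInnerProductMesonField.
  thread_loop( (int r = 0; r < Nslice; r++), {
    for (int i = 0; i < Nvol; i++) slice_sum[r] += data[r * Nvol + i];
  });

  for (int r = 0; r < Nslice; r++) std::printf("slice %d : %g\n", r, slice_sum[r]);
  return 0;
}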