Advanced Computing Platform for Theoretical Physics

Commit 6cf13493 authored by mikeaclark
Browse files

Revisions to blas benchmarking

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@488 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 82646005
...@@ -36,6 +36,7 @@ int blocksComplex = 0; ...@@ -36,6 +36,7 @@ int blocksComplex = 0;
int blocksFloat3 = 0; int blocksFloat3 = 0;
unsigned long long blas_quda_flops; unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;
void initReduceFloat(int blocks) { void initReduceFloat(int blocks) {
if (blocks != blocksFloat) { if (blocks != blocksFloat) {
...@@ -482,6 +483,7 @@ void axpbyCuda(double a, ParitySpinor x, double b, ParitySpinor y) { ...@@ -482,6 +483,7 @@ void axpbyCuda(double a, ParitySpinor x, double b, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
axpbyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, b, (double*)y.spinor, x.length); axpbyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, b, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -532,6 +534,7 @@ void xpyCuda(ParitySpinor x, ParitySpinor y) { ...@@ -532,6 +534,7 @@ void xpyCuda(ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
xpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length); xpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -581,6 +584,7 @@ void axpyCuda(double a, ParitySpinor x, ParitySpinor y) { ...@@ -581,6 +584,7 @@ void axpyCuda(double a, ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
axpyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, x.length); axpyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -630,6 +634,7 @@ void xpayCuda(ParitySpinor x, double a, ParitySpinor y) { ...@@ -630,6 +634,7 @@ void xpayCuda(ParitySpinor x, double a, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
xpayKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, a, (double*)y.spinor, x.length); xpayKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, a, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -675,11 +680,12 @@ __global__ void mxpyHKernel(short4 *yH, float *yN, int length) { ...@@ -675,11 +680,12 @@ __global__ void mxpyHKernel(short4 *yH, float *yN, int length) {
// performs the operation y[i] -= x[i] (minus x plus y) // performs the operation y[i] -= x[i] (minus x plus y)
void mxpyQuda(ParitySpinor x, ParitySpinor y) { void mxpyCuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y); checkSpinor(x,y);
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
mxpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length); mxpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -723,6 +729,7 @@ void axCuda(double a, ParitySpinor x) { ...@@ -723,6 +729,7 @@ void axCuda(double a, ParitySpinor x) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
axKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, x.length); axKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -778,6 +785,7 @@ void caxpyCuda(double2 a, ParitySpinor x, ParitySpinor y) { ...@@ -778,6 +785,7 @@ void caxpyCuda(double2 a, ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, (double2*)y.spinor, length); caxpyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -838,6 +846,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) { ...@@ -838,6 +846,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length); caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -911,6 +920,7 @@ void cxpaypbzCuda(ParitySpinor x, double2 a, ParitySpinor y, double2 b, ParitySp ...@@ -911,6 +920,7 @@ void cxpaypbzCuda(ParitySpinor x, double2 a, ParitySpinor y, double2 b, ParitySp
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 4*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
cxpaypbzKernel<<<dimGrid, dimBlock>>>((double2*)x.spinor, a, (double2*)y.spinor, b, (double2*)z.spinor, length); cxpaypbzKernel<<<dimGrid, dimBlock>>>((double2*)x.spinor, a, (double2*)y.spinor, b, (double2*)z.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -978,6 +988,7 @@ void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, doub ...@@ -978,6 +988,7 @@ void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, doub
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 5*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, (double*)z.spinor, b, x.length); axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, (double*)z.spinor, b, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1062,6 +1073,7 @@ void caxpbypzYmbwCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y, ...@@ -1062,6 +1073,7 @@ void caxpbypzYmbwCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1)); int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 6*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpbypzYmbwKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, caxpbypzYmbwKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor,
(double2*)z.spinor, (double2*)w.spinor, length); (double2*)z.spinor, (double2*)w.spinor, length);
...@@ -1188,6 +1200,7 @@ template <typename Float> ...@@ -1188,6 +1200,7 @@ template <typename Float>
double sumCuda(ParitySpinor a) { double sumCuda(ParitySpinor a) {
blas_quda_flops += a.length; blas_quda_flops += a.length;
blas_quda_bytes += a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) { if (a.precision == QUDA_DOUBLE_PRECISION) {
return sumFCuda((double*)a.spinor, a.length); return sumFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) { } else if (a.precision == QUDA_SINGLE_PRECISION) {
...@@ -1242,6 +1255,7 @@ template <typename Float> ...@@ -1242,6 +1255,7 @@ template <typename Float>
double normCuda(ParitySpinor a) { double normCuda(ParitySpinor a) {
blas_quda_flops += 2*a.length; blas_quda_flops += 2*a.length;
blas_quda_bytes += a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) { if (a.precision == QUDA_DOUBLE_PRECISION) {
return normFCuda((double*)a.spinor, a.length); return normFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) { } else if (a.precision == QUDA_SINGLE_PRECISION) {
...@@ -1300,6 +1314,7 @@ template <typename Float> ...@@ -1300,6 +1314,7 @@ template <typename Float>
double reDotProductCuda(ParitySpinor a, ParitySpinor b) { double reDotProductCuda(ParitySpinor a, ParitySpinor b) {
blas_quda_flops += 2*a.length; blas_quda_flops += 2*a.length;
checkSpinor(a, b); checkSpinor(a, b);
blas_quda_bytes += 2*a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) { if (a.precision == QUDA_DOUBLE_PRECISION) {
return reDotProductFCuda((double*)a.spinor, (double*)b.spinor, a.length); return reDotProductFCuda((double*)a.spinor, (double*)b.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) { } else if (a.precision == QUDA_SINGLE_PRECISION) {
...@@ -1366,6 +1381,7 @@ template <typename Float> ...@@ -1366,6 +1381,7 @@ template <typename Float>
double axpyNormCuda(double a, ParitySpinor x, ParitySpinor y) { double axpyNormCuda(double a, ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length; blas_quda_flops += 4*x.length;
checkSpinor(x,y); checkSpinor(x,y);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return axpyNormFCuda(a, (double*)x.spinor, (double*)y.spinor, x.length); return axpyNormFCuda(a, (double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1433,6 +1449,7 @@ template <typename Float> ...@@ -1433,6 +1449,7 @@ template <typename Float>
double xmyNormCuda(ParitySpinor x, ParitySpinor y) { double xmyNormCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops +=3*x.length; blas_quda_flops +=3*x.length;
checkSpinor(x,y); checkSpinor(x,y);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return xmyNormFCuda((double*)x.spinor, (double*)y.spinor, x.length); return xmyNormFCuda((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1505,6 +1522,7 @@ double2 cDotProductCuda(ParitySpinor x, ParitySpinor y) { ...@@ -1505,6 +1522,7 @@ double2 cDotProductCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length; blas_quda_flops += 4*x.length;
checkSpinor(x,y); checkSpinor(x,y);
int length = x.length/2; int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
char c = NULL; char c = NULL;
return cDotProductFCuda((double2*)x.spinor, (double2*)y.spinor, c, length); return cDotProductFCuda((double2*)x.spinor, (double2*)y.spinor, c, length);
...@@ -1591,6 +1609,7 @@ double2 xpaycDotzyCuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z) ...@@ -1591,6 +1609,7 @@ double2 xpaycDotzyCuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z)
checkSpinor(x,y); checkSpinor(x,y);
checkSpinor(x,z); checkSpinor(x,z);
int length = x.length/2; int length = x.length/2;
blas_quda_bytes += 4*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return xpaycDotzyFCuda((double2*)x.spinor, a, (double2*)y.spinor, (double2*)z.spinor, length); return xpaycDotzyFCuda((double2*)x.spinor, a, (double2*)y.spinor, (double2*)z.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1680,6 +1699,7 @@ double3 cDotProductNormACuda(ParitySpinor x, ParitySpinor y) { ...@@ -1680,6 +1699,7 @@ double3 cDotProductNormACuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length; blas_quda_flops += 6*x.length;
checkSpinor(x,y); checkSpinor(x,y);
int length = x.length/2; int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return cDotProductNormAFCuda((double2*)x.spinor, (double2*)y.spinor, length); return cDotProductNormAFCuda((double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1766,6 +1786,7 @@ double3 cDotProductNormBCuda(ParitySpinor x, ParitySpinor y) { ...@@ -1766,6 +1786,7 @@ double3 cDotProductNormBCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length; blas_quda_flops += 6*x.length;
checkSpinor(x,y); checkSpinor(x,y);
int length = x.length/2; int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return cDotProductNormBFCuda((double2*)x.spinor, (double2*)y.spinor, length); return cDotProductNormBFCuda((double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -1898,6 +1919,7 @@ double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ...@@ -1898,6 +1919,7 @@ double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b,
checkSpinor(x,w); checkSpinor(x,w);
checkSpinor(x,u); checkSpinor(x,u);
int length = x.length/2; int length = x.length/2;
blas_quda_bytes += 7*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
return caxpbypzYmbwcDotProductWYNormYFCuda(a, (double2*)x.spinor, b, (double2*)y.spinor, (double2*)z.spinor, return caxpbypzYmbwcDotProductWYNormYFCuda(a, (double2*)x.spinor, b, (double2*)y.spinor, (double2*)z.spinor,
(double2*)w.spinor, (double2*)u.spinor, length); (double2*)w.spinor, (double2*)u.spinor, length);
......
...@@ -45,6 +45,7 @@ extern "C" { ...@@ -45,6 +45,7 @@ extern "C" {
ParitySpinor z, ParitySpinor w, ParitySpinor u); ParitySpinor z, ParitySpinor w, ParitySpinor u);
extern unsigned long long blas_quda_flops; extern unsigned long long blas_quda_flops;
extern unsigned long long blas_quda_bytes;
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -25,7 +25,7 @@ void init() { ...@@ -25,7 +25,7 @@ void init() {
X[3] = 32; X[3] = 32;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION; inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.verbosity = QUDA_VERBOSE; inv_param.verbosity = QUDA_VERBOSE;
invert_param = &inv_param; invert_param = &inv_param;
...@@ -136,7 +136,7 @@ double benchmark(int kernel) { ...@@ -136,7 +136,7 @@ double benchmark(int kernel) {
break; break;
case 17: case 17:
xpayDotzyCuda(x, a, y, z); xpaycDotzyCuda(x, a, y, z);
break; break;
// double3 // double3
...@@ -167,19 +167,45 @@ int main(int argc, char** argv) { ...@@ -167,19 +167,45 @@ int main(int argc, char** argv) {
init(); init();
int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
char names[][100] = {
"axpbyCuda(a, x, b, y): ",
"xpyCuda(x, y): ",
"axpyCuda(a, x, y): ",
"xpayCuda(x, a, y): ",
"mxpyCuda(x, y): ",
"axCuda(a, x): ",
"caxpyCuda(a2, x, y): ",
"caxpbyCuda(a2, x, b2, y): ",
"cxpaypbzCuda(x, a2, y, b2, z): ",
"axpyZpbxCuda(a, x, y, z, b): ",
"caxpbypzYmbwCuda(a2, x, b2, y, z, w): ",
"sumCuda(x): ",
"normCuda(x): ",
"reDotProductCuda(x, y): ",
"axpyNormCuda(a, x, y): ",
"xmyNormCuda(x, y): ",
"cDotProductCuda(x, y): ",
"xpaycDotzyCuda(x, a, y, z): ",
"cDotProductNormACuda(x, y): ",
"cDotProductNormBCuda(x, y): ",
"caxpbypzYmbwcDotProductWYNormYQuda(a2, x, b2, y, z, w, v): "
};
nIters = 1; nIters = 1;
// first do warmup run // first do warmup run
for (int i = 0; i < 19; i++) { for (int i = 0; i <= 20; i++) {
benchmark(kernels[i]); benchmark(kernels[i]);
} }
nIters = 1000; nIters = 1000;
for (int i = 0; i < 19; i++) { for (int i = 0; i <= 20; i++) {
blas_quda_flops = 0; blas_quda_flops = 0;
blas_quda_bytes = 0;
double secs = benchmark(kernels[i]); double secs = benchmark(kernels[i]);
double flops = blas_quda_flops / (double)nIters; double flops = blas_quda_flops / (double)nIters;
printf("Average time: %f s, flops = %e, Gflops/s = %f\n", secs, flops, flops/secs*1e-9); double bytes = blas_quda_bytes / (double)nIters;
printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
names[i], secs, flops, flops/secs*1e-9, bytes/(secs*(1<<30)));
//printf("Bandwidth: %f GiB/s\n\n", GiB / secs); //printf("Bandwidth: %f GiB/s\n\n", GiB / secs);
} }
} }
......
...@@ -33,9 +33,9 @@ extern "C" { ...@@ -33,9 +33,9 @@ extern "C" {
} QudaInverterType; } QudaInverterType;
typedef enum QudaPrecision_s { typedef enum QudaPrecision_s {
QUDA_HALF_PRECISION, QUDA_HALF_PRECISION = 2,
QUDA_SINGLE_PRECISION, QUDA_SINGLE_PRECISION = 4,
QUDA_DOUBLE_PRECISION QUDA_DOUBLE_PRECISION = 8
} QudaPrecision; } QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo) // Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment