Advanced Computing Platform for Theoretical Physics

Commit 6cf13493 authored by mikeaclark

Revisions to blas benchmarking

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@488 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 82646005
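
For orientation: the commit mirrors the existing blas_quda_flops counter with a global blas_quda_bytes counter. Each BLAS wrapper adds an estimate of the memory traffic it generates before launching its kernel, and the benchmark later turns the accumulated totals into Gflop/s and GiB/s figures. A minimal sketch of the pattern, assuming a hypothetical wrapper name and an explicit bytes-per-real argument in place of the precision bookkeeping (the counter names are the real ones, everything else is illustrative):

// Sketch of the byte-accounting pattern added in this commit.
// axpySketch and bytes_per_real are illustrative, not QUDA API.
unsigned long long blas_quda_flops = 0;
unsigned long long blas_quda_bytes = 0;

void axpySketch(double a, const double *x, double *y, int length, int bytes_per_real) {
  blas_quda_flops += 2ULL * length;                   // one multiply and one add per element
  blas_quda_bytes += 3ULL * length * bytes_per_real;  // read x, read y, write y
  for (int i = 0; i < length; i++)                    // host stand-in for the CUDA kernel launch
    y[i] = a * x[i] + y[i];
}
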
......@@ -36,6 +36,7 @@ int blocksComplex = 0;
int blocksFloat3 = 0;
unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;
void initReduceFloat(int blocks) {
if (blocks != blocksFloat) {
......@@ -482,6 +483,7 @@ void axpbyCuda(double a, ParitySpinor x, double b, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
axpbyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, b, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -532,6 +534,7 @@ void xpyCuda(ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
xpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -581,6 +584,7 @@ void axpyCuda(double a, ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
axpyKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -630,6 +634,7 @@ void xpayCuda(ParitySpinor x, double a, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
xpayKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, a, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -675,11 +680,12 @@ __global__ void mxpyHKernel(short4 *yH, float *yN, int length) {
// performs the operation y[i] -= x[i] (minus x plus y)
void mxpyQuda(ParitySpinor x, ParitySpinor y) {
void mxpyCuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y);
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
mxpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -723,6 +729,7 @@ void axCuda(double a, ParitySpinor x) {
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
axKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -778,6 +785,7 @@ void caxpyCuda(double2 a, ParitySpinor x, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -838,6 +846,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -911,6 +920,7 @@ void cxpaypbzCuda(ParitySpinor x, double2 a, ParitySpinor y, double2 b, ParitySp
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 4*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
cxpaypbzKernel<<<dimGrid, dimBlock>>>((double2*)x.spinor, a, (double2*)y.spinor, b, (double2*)z.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -978,6 +988,7 @@ void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, doub
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 5*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, (double*)z.spinor, b, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1062,6 +1073,7 @@ void caxpbypzYmbwCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 6*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpbypzYmbwKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor,
(double2*)z.spinor, (double2*)w.spinor, length);
......@@ -1188,6 +1200,7 @@ template <typename Float>
double sumCuda(ParitySpinor a) {
blas_quda_flops += a.length;
blas_quda_bytes += a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) {
return sumFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
......@@ -1242,6 +1255,7 @@ template <typename Float>
double normCuda(ParitySpinor a) {
blas_quda_flops += 2*a.length;
blas_quda_bytes += a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) {
return normFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
......@@ -1300,6 +1314,7 @@ template <typename Float>
double reDotProductCuda(ParitySpinor a, ParitySpinor b) {
blas_quda_flops += 2*a.length;
checkSpinor(a, b);
blas_quda_bytes += 2*a.length*sizeof(a.precision);
if (a.precision == QUDA_DOUBLE_PRECISION) {
return reDotProductFCuda((double*)a.spinor, (double*)b.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
......@@ -1366,6 +1381,7 @@ template <typename Float>
double axpyNormCuda(double a, ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length;
checkSpinor(x,y);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return axpyNormFCuda(a, (double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1433,6 +1449,7 @@ template <typename Float>
double xmyNormCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops +=3*x.length;
checkSpinor(x,y);
blas_quda_bytes += 3*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return xmyNormFCuda((double*)x.spinor, (double*)y.spinor, x.length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1505,6 +1522,7 @@ double2 cDotProductCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length;
checkSpinor(x,y);
int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
char c = NULL;
return cDotProductFCuda((double2*)x.spinor, (double2*)y.spinor, c, length);
......@@ -1591,6 +1609,7 @@ double2 xpaycDotzyCuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z)
checkSpinor(x,y);
checkSpinor(x,z);
int length = x.length/2;
blas_quda_bytes += 4*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return xpaycDotzyFCuda((double2*)x.spinor, a, (double2*)y.spinor, (double2*)z.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1680,6 +1699,7 @@ double3 cDotProductNormACuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length;
checkSpinor(x,y);
int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return cDotProductNormAFCuda((double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1766,6 +1786,7 @@ double3 cDotProductNormBCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length;
checkSpinor(x,y);
int length = x.length/2;
blas_quda_bytes += 2*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return cDotProductNormBFCuda((double2*)x.spinor, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
......@@ -1898,6 +1919,7 @@ double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b,
checkSpinor(x,w);
checkSpinor(x,u);
int length = x.length/2;
blas_quda_bytes += 7*x.length*sizeof(x.precision);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return caxpbypzYmbwcDotProductWYNormYFCuda(a, (double2*)x.spinor, b, (double2*)y.spinor, (double2*)z.spinor,
(double2*)w.spinor, (double2*)u.spinor, length);
......
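
The prefactor on each new blas_quda_bytes increment is the number of spinor vectors the corresponding kernel touches, counting reads plus writes: axpbyCuda (y = a*x + b*y) reads x and y and writes y for a factor of 3, axCuda (x = a*x) reads and writes only x for a factor of 2, cxpaypbzCuda touches four vectors, axpyZpbxCuda five, caxpbypzYmbwCuda six, and caxpbypzYmbwcDotProductWYNormYQuda seven, while the pure reductions count only their reads. A worked example of the estimate, using a hypothetical helper name:

#include <cstddef>

// Hypothetical helper (not QUDA API): estimated traffic of y = a*x + b*y at a
// given precision, counting one read of x, one read of y and one write of y.
size_t axpbyTrafficBytes(size_t length, size_t bytes_per_real) {
  return 3 * length * bytes_per_real;  // the factor of 3 used for axpbyCuda above
}
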
......@@ -45,6 +45,7 @@ extern "C" {
ParitySpinor z, ParitySpinor w, ParitySpinor u);
extern unsigned long long blas_quda_flops;
extern unsigned long long blas_quda_bytes;
#ifdef __cplusplus
}
......
......@@ -25,7 +25,7 @@ void init() {
X[3] = 32;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.verbosity = QUDA_VERBOSE;
invert_param = &inv_param;
......@@ -136,7 +136,7 @@ double benchmark(int kernel) {
break;
case 17:
xpayDotzyCuda(x, a, y, z);
xpaycDotzyCuda(x, a, y, z);
break;
// double3
......@@ -167,19 +167,45 @@ int main(int argc, char** argv) {
init();
int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
char names[][100] = {
"axpbyCuda(a, x, b, y): ",
"xpyCuda(x, y): ",
"axpyCuda(a, x, y): ",
"xpayCuda(x, a, y): ",
"mxpyCuda(x, y): ",
"axCuda(a, x): ",
"caxpyCuda(a2, x, y): ",
"caxpbyCuda(a2, x, b2, y): ",
"cxpaypbzCuda(x, a2, y, b2, z): ",
"axpyZpbxCuda(a, x, y, z, b): ",
"caxpbypzYmbwCuda(a2, x, b2, y, z, w): ",
"sumCuda(x): ",
"normCuda(x): ",
"reDotProductCuda(x, y): ",
"axpyNormCuda(a, x, y): ",
"xmyNormCuda(x, y): ",
"cDotProductCuda(x, y): ",
"xpaycDotzyCuda(x, a, y, z): ",
"cDotProductNormACuda(x, y): ",
"cDotProductNormBCuda(x, y): ",
"caxpbypzYmbwcDotProductWYNormYQuda(a2, x, b2, y, z, w, v): "
};
nIters = 1;
// first do warmup run
for (int i = 0; i < 19; i++) {
for (int i = 0; i <= 20; i++) {
benchmark(kernels[i]);
}
nIters = 1000;
for (int i = 0; i < 19; i++) {
for (int i = 0; i <= 20; i++) {
blas_quda_flops = 0;
blas_quda_bytes = 0;
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops / (double)nIters;
printf("Average time: %f s, flops = %e, Gflops/s = %f\n", secs, flops, flops/secs*1e-9);
double bytes = blas_quda_bytes / (double)nIters;
printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
names[i], secs, flops, flops/secs*1e-9, bytes/(secs*(1<<30)));
//printf("Bandwidth: %f GiB/s\n\n", GiB / secs);
}
}
......
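
The updated printf reports both rates from counts and a time that cover the same amount of work: Gflop/s uses the decimal factor 1e-9, while bandwidth is quoted in binary gibibytes, dividing by 2^30 = (1 << 30). A standalone restatement of the conversion, with a hypothetical helper name:

#include <cstdio>

// Convert a flop count, byte count and elapsed time (all for the same work)
// into the rates printed by the benchmark: Gflop/s (1e9) and GiB/s (2^30).
void printRates(const char *name, double flops, double bytes, double secs) {
  double gflops = flops / secs * 1e-9;         // 1 Gflop = 1e9 flops
  double gibps  = bytes / (secs * (1 << 30));  // 1 GiB   = 2^30 bytes
  printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
         name, secs, flops, gflops, gibps);
}
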
......@@ -33,9 +33,9 @@ extern "C" {
} QudaInverterType;
typedef enum QudaPrecision_s {
QUDA_HALF_PRECISION,
QUDA_SINGLE_PRECISION,
QUDA_DOUBLE_PRECISION
QUDA_HALF_PRECISION = 2,
QUDA_SINGLE_PRECISION = 4,
QUDA_DOUBLE_PRECISION = 8
} QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
......
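
The new enumerator values are not arbitrary: 2, 4 and 8 are the storage sizes in bytes of a half, single and double precision real, so a precision value can itself serve as the bytes-per-real factor in traffic estimates. Note that sizeof applied to a variable of enum type yields the size of the underlying enum type (typically 4 bytes), not the enumerator's value. A small standalone illustration of the distinction:

#include <cstdio>

typedef enum QudaPrecision_s {
  QUDA_HALF_PRECISION = 2,
  QUDA_SINGLE_PRECISION = 4,
  QUDA_DOUBLE_PRECISION = 8
} QudaPrecision;

int main() {
  QudaPrecision prec = QUDA_DOUBLE_PRECISION;
  printf("bytes per real: %d\n", (int)prec);      // the enumerator value: 8
  printf("sizeof(prec):   %zu\n", sizeof(prec));  // size of the enum type, typically 4
  return 0;
}
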