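Updated blas_test to time kernels with CUDA events (cudaEventRecord/cudaEventElapsedTime) instead of stopwatchStart/stopwatchReadSeconds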

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@490 be54200a-260c-0410-bdd7-ce6af2a381ab

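Updated blas_test to time kernels with CUDA events (cudaEventRecord/cudaEventElapsedTime) instead of stopwatchStart/stopwatchReadSeconds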
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@490 be54200a-260c-0410-bdd7-ce6af2a381ab
bf3ede9d · mikeaclark · a7974dba · bf3ede9d · bf3ede9d · bf3ede9d
Commit bf3ede9d authored Oct 08, 2009 by mikeaclark
--- a/blas_quda.cu
+++ b/blas_quda.cu
@@ -847,6 +847,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
  dim3 dimBlock(REDUCE_THREADS, 1, 1);
  dim3 dimGrid(blocks, 1, 1);
  blas_quda_bytes += 3*x.length*sizeof(x.precision);
+  blas_quda_flops += 7*x.length;
  if (x.precision == QUDA_DOUBLE_PRECISION) {
    caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length);
  } else if (x.precision == QUDA_SINGLE_PRECISION) {
@@ -863,7 +864,6 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
    float2 bf2 = make_float2((float)b.x, (float)b.y);
    caxpbyHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
  }
-  blas_quda_flops += 7*x.length;
 }

 template <typename Float2>

--- a/blas_test.c
+++ b/blas_test.c
@@ -13,7 +13,7 @@ QudaInvertParam inv_param;

 ParitySpinor x, y, z, w, v;

-int nIters = 1000;
+int nIters;

 void init() {

@@ -58,9 +58,10 @@ double benchmark(int kernel) {
  double a, b;
  double2 a2, b2;

-  //printf("Executing %d kernel loops...", nIters);
-  //fflush(stdout);
-  stopwatchStart();
+  cudaEvent_t start, end;
+  cudaEventCreate(&start);
+  cudaEventCreate(&end);
+  cudaEventRecord(start, 0);

  for (int i=0; i < nIters; ++i) {
    switch (kernel) {
@@ -158,7 +159,11 @@ double benchmark(int kernel) {
    }
  }
  
-  double secs = stopwatchReadSeconds() / nIters;
+  cudaEventRecord(end, 0);
+  cudaEventSynchronize(end);
+  float runTime;
+  cudaEventElapsedTime(&runTime, start, end);
+  double secs = runTime / 1000;
  return secs;
 }

@@ -202,10 +207,10 @@ int main(int argc, char** argv) {
    blas_quda_flops = 0;
    blas_quda_bytes = 0;
    double secs = benchmark(kernels[i]);
-    double flops = blas_quda_flops / (double)nIters;
-    double bytes = blas_quda_bytes / (double)nIters;
+    double flops = blas_quda_flops;
+    double bytes = blas_quda_bytes;
    printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n", 
-	   names[i], secs, flops, flops/secs*1e-9, bytes/(secs*(1<<30)));
+	   names[i], secs, flops, (flops*1e-9)/(secs), bytes/(secs*(1<<30)));
    //printf("Bandwidth:    %f GiB/s\n\n", GiB / secs);
  }
 }

--- a/dslash_test.c
+++ b/dslash_test.c
@@ -29,7 +29,7 @@ void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
 double kappa = 1.0;
 int ODD_BIT = 1;
 int DAGGER_BIT = 0;
-int TRANSFER = 1; // include transfer time in the benchmark?
+int TRANSFER = 0; // include transfer time in the benchmark?

 void init() {

@@ -45,7 +45,7 @@ void init() {
  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;

  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
-  gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
+  gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
  gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
  gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
@@ -64,14 +64,14 @@ void init() {
  inv_param.matpc_type = QUDA_MATPC_ODD_ODD;

  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
-  inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
+  inv_param.cuda_prec = QUDA_HALF_PRECISION;

  if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
  else inv_param.dirac_order = QUDA_DIRAC_ORDER;

  if (clover_yes) {
    inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
-    inv_param.clover_cuda_prec = QUDA_DOUBLE_PRECISION;
+    inv_param.clover_cuda_prec = QUDA_HALF_PRECISION;
    inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
  }
@@ -308,8 +308,8 @@ void dslashTest() {
      
      printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
      
-      //if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
-      //else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);    
+      if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
+      else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);    
  }    
  end();
 }