Advanced Computing Platform for Theoretical Physics

Commit bf3ede9d authored by mikeaclark
Browse files

Updated blas_test with better timer

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@490 be54200a-260c-0410-bdd7-ce6af2a381ab
parent a7974dba
...@@ -847,6 +847,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) { ...@@ -847,6 +847,7 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
dim3 dimBlock(REDUCE_THREADS, 1, 1); dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1); dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 3*x.length*sizeof(x.precision); blas_quda_bytes += 3*x.length*sizeof(x.precision);
blas_quda_flops += 7*x.length;
if (x.precision == QUDA_DOUBLE_PRECISION) { if (x.precision == QUDA_DOUBLE_PRECISION) {
caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length); caxpbyKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, b, (double2*)y.spinor, length);
} else if (x.precision == QUDA_SINGLE_PRECISION) { } else if (x.precision == QUDA_SINGLE_PRECISION) {
...@@ -863,7 +864,6 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) { ...@@ -863,7 +864,6 @@ void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
float2 bf2 = make_float2((float)b.x, (float)b.y); float2 bf2 = make_float2((float)b.x, (float)b.y);
caxpbyHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize); caxpbyHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
} }
blas_quda_flops += 7*x.length;
} }
template <typename Float2> template <typename Float2>
......
...@@ -13,7 +13,7 @@ QudaInvertParam inv_param; ...@@ -13,7 +13,7 @@ QudaInvertParam inv_param;
ParitySpinor x, y, z, w, v; ParitySpinor x, y, z, w, v;
int nIters = 1000; int nIters;
void init() { void init() {
...@@ -58,9 +58,10 @@ double benchmark(int kernel) { ...@@ -58,9 +58,10 @@ double benchmark(int kernel) {
double a, b; double a, b;
double2 a2, b2; double2 a2, b2;
//printf("Executing %d kernel loops...", nIters); cudaEvent_t start, end;
//fflush(stdout); cudaEventCreate(&start);
stopwatchStart(); cudaEventCreate(&end);
cudaEventRecord(start, 0);
for (int i=0; i < nIters; ++i) { for (int i=0; i < nIters; ++i) {
switch (kernel) { switch (kernel) {
...@@ -158,7 +159,11 @@ double benchmark(int kernel) { ...@@ -158,7 +159,11 @@ double benchmark(int kernel) {
} }
} }
double secs = stopwatchReadSeconds() / nIters; cudaEventRecord(end, 0);
cudaEventSynchronize(end);
float runTime;
cudaEventElapsedTime(&runTime, start, end);
double secs = runTime / 1000;
return secs; return secs;
} }
...@@ -202,10 +207,10 @@ int main(int argc, char** argv) { ...@@ -202,10 +207,10 @@ int main(int argc, char** argv) {
blas_quda_flops = 0; blas_quda_flops = 0;
blas_quda_bytes = 0; blas_quda_bytes = 0;
double secs = benchmark(kernels[i]); double secs = benchmark(kernels[i]);
double flops = blas_quda_flops / (double)nIters; double flops = blas_quda_flops;
double bytes = blas_quda_bytes / (double)nIters; double bytes = blas_quda_bytes;
printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n", printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
names[i], secs, flops, flops/secs*1e-9, bytes/(secs*(1<<30))); names[i], secs, flops, (flops*1e-9)/(secs), bytes/(secs*(1<<30)));
//printf("Bandwidth: %f GiB/s\n\n", GiB / secs); //printf("Bandwidth: %f GiB/s\n\n", GiB / secs);
} }
} }
......
...@@ -29,7 +29,7 @@ void *spinorGPU, *spinorGPUEven, *spinorGPUOdd; ...@@ -29,7 +29,7 @@ void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
double kappa = 1.0; double kappa = 1.0;
int ODD_BIT = 1; int ODD_BIT = 1;
int DAGGER_BIT = 0; int DAGGER_BIT = 0;
int TRANSFER = 1; // include transfer time in the benchmark? int TRANSFER = 0; // include transfer time in the benchmark?
void init() { void init() {
...@@ -45,7 +45,7 @@ void init() { ...@@ -45,7 +45,7 @@ void init() {
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
...@@ -64,14 +64,14 @@ void init() { ...@@ -64,14 +64,14 @@ void init() {
inv_param.matpc_type = QUDA_MATPC_ODD_ODD; inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION; inv_param.cuda_prec = QUDA_HALF_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER; else inv_param.dirac_order = QUDA_DIRAC_ORDER;
if (clover_yes) { if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; inv_param.clover_cuda_prec = QUDA_HALF_PRECISION;
inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
} }
...@@ -308,8 +308,8 @@ void dslashTest() { ...@@ -308,8 +308,8 @@ void dslashTest() {
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED"); printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
//if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec); if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
//else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec); else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
} }
end(); end();
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment