Advanced Computing Platform for Theoretical Physics

Commit 7ac1f1ee authored by rbabich
Browse files

quda: added missing calls to cudaEventDestroy() in blas_test


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@607 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7364b3e7
......@@ -60,9 +60,6 @@ extern "C" {
extern unsigned long long blas_quda_flops;
extern unsigned long long blas_quda_bytes;
extern int blas_threads[3][22];
extern int blas_blocks[3][22];
#ifdef __cplusplus
}
#endif
......
......@@ -102,7 +102,7 @@ blas_threads[1][2] = 128;
blas_blocks[1][2] = 128;
// Kernel: axpyCuda
blas_threads[1][3] = 64;
blas_threads[1][3] = 128;
blas_blocks[1][3] = 128;
// Kernel: xpayCuda
......@@ -130,8 +130,8 @@ blas_threads[1][9] = 64;
blas_blocks[1][9] = 128;
// Kernel: axpyZpbxCuda
blas_threads[1][10] = 128;
blas_blocks[1][10] = 128;
blas_threads[1][10] = 64;
blas_blocks[1][10] = 2048;
// Kernel: caxpbypzYmbwCuda
blas_threads[1][11] = 64;
......@@ -211,11 +211,11 @@ blas_blocks[2][7] = 2048;
// Kernel: caxpbyCuda
blas_threads[2][8] = 64;
blas_blocks[2][8] = 128;
blas_blocks[2][8] = 2048;
// Kernel: cxpaypbzCuda
blas_threads[2][9] = 64;
blas_blocks[2][9] = 1024;
blas_blocks[2][9] = 2048;
// Kernel: axpyZpbxCuda
blas_threads[2][10] = 64;
......
......@@ -34,9 +34,9 @@ unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;
// Number of threads used for each blas kernel
int blas_threads[3][22];
static int blas_threads[3][22];
// Number of thread blocks for each blas kernel
int blas_blocks[3][22];
static int blas_blocks[3][22];
static dim3 blasBlock;
static dim3 blasGrid;
......
......@@ -187,6 +187,9 @@ double benchmark(int kernel) {
cudaEventSynchronize(end);
float runTime;
cudaEventElapsedTime(&runTime, start, end);
cudaEventDestroy(start);
cudaEventDestroy(end);
double secs = runTime / 1000;
return secs;
}
......@@ -242,8 +245,6 @@ int main(int argc, char** argv) {
for (int grid=0; grid<Ngrids; grid++) {
setBlasParam(i, prec, blockSizes[thread], gridSizes[grid]);
if (i==12) printfQuda("warmup %d %d\n", blockSizes[thread], gridSizes[grid]); // DEBUG
// first do warmup run
nIters = 1;
benchmark(kernels[i]);
......@@ -252,14 +253,6 @@ int main(int argc, char** argv) {
blas_quda_flops = 0;
blas_quda_bytes = 0;
// DEBUG
{
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error));
}
if (i==12) printfQuda("running %d %d\n", blockSizes[thread], gridSizes[grid]);
// END DEBUG
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops;
double bytes = blas_quda_bytes;
......@@ -269,8 +262,6 @@ int main(int argc, char** argv) {
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error)); // DEBUG
if (gbytes > gbytes_max && error == cudaSuccess) { // prevents selection of failed parameters
gflops_max = gflops;
gbytes_max = gbytes;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment