Advanced Computing Platform for Theoretical Physics

Commit 7ac1f1ee authored by rbabich
Browse files

quda: added missing calls to cudaEventDestroy() in blas_test


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@607 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7364b3e7
......@@ -60,9 +60,6 @@ extern "C" {
extern unsigned long long blas_quda_flops;
extern unsigned long long blas_quda_bytes;
extern int blas_threads[3][22];
extern int blas_blocks[3][22];
#ifdef __cplusplus
}
#endif
......
/*
Auto-tuned blas CUDA parameters, generated by blas_test
*/
// Kernel: copyCuda
// Kernel: copyCuda
blas_threads[0][0] = 64;
blas_blocks[0][0] = 2048;
// Kernel: axpbyCuda
// Kernel: axpbyCuda
blas_threads[0][1] = 64;
blas_blocks[0][1] = 2048;
// Kernel: xpyCuda
// Kernel: xpyCuda
blas_threads[0][2] = 64;
blas_blocks[0][2] = 2048;
// Kernel: axpyCuda
// Kernel: axpyCuda
blas_threads[0][3] = 64;
blas_blocks[0][3] = 2048;
// Kernel: xpayCuda
// Kernel: xpayCuda
blas_threads[0][4] = 64;
blas_blocks[0][4] = 2048;
// Kernel: mxpyCuda
// Kernel: mxpyCuda
blas_threads[0][5] = 64;
blas_blocks[0][5] = 2048;
// Kernel: axCuda
// Kernel: axCuda
blas_threads[0][6] = 64;
blas_blocks[0][6] = 2048;
// Kernel: caxpyCuda
// Kernel: caxpyCuda
blas_threads[0][7] = 64;
blas_blocks[0][7] = 2048;
// Kernel: caxpbyCuda
// Kernel: caxpbyCuda
blas_threads[0][8] = 64;
blas_blocks[0][8] = 2048;
// Kernel: cxpaypbzCuda
// Kernel: cxpaypbzCuda
blas_threads[0][9] = 64;
blas_blocks[0][9] = 2048;
// Kernel: axpyZpbxCuda
// Kernel: axpyZpbxCuda
blas_threads[0][10] = 64;
blas_blocks[0][10] = 512;
// Kernel: caxpbypzYmbwCuda
// Kernel: caxpbypzYmbwCuda
blas_threads[0][11] = 64;
blas_blocks[0][11] = 1024;
// Kernel: sumCuda
// Kernel: sumCuda
blas_threads[0][12] = 64;
blas_blocks[0][12] = 128;
// Kernel: normCuda
// Kernel: normCuda
blas_threads[0][13] = 64;
blas_blocks[0][13] = 128;
// Kernel: reDotProductCuda
// Kernel: reDotProductCuda
blas_threads[0][14] = 64;
blas_blocks[0][14] = 64;
// Kernel: axpyNormCuda
// Kernel: axpyNormCuda
blas_threads[0][15] = 64;
blas_blocks[0][15] = 256;
// Kernel: xmyNormCuda
// Kernel: xmyNormCuda
blas_threads[0][16] = 64;
blas_blocks[0][16] = 512;
// Kernel: cDotProductCuda
// Kernel: cDotProductCuda
blas_threads[0][17] = 64;
blas_blocks[0][17] = 64;
// Kernel: xpaycDotzyCuda
// Kernel: xpaycDotzyCuda
blas_threads[0][18] = 64;
blas_blocks[0][18] = 256;
// Kernel: cDotProductNormACuda
// Kernel: cDotProductNormACuda
blas_threads[0][19] = 64;
blas_blocks[0][19] = 64;
// Kernel: cDotProductNormBCuda
// Kernel: cDotProductNormBCuda
blas_threads[0][20] = 64;
blas_blocks[0][20] = 64;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[0][21] = 64;
blas_blocks[0][21] = 512;
// Kernel: copyCuda
// Kernel: copyCuda
blas_threads[1][0] = 64;
blas_blocks[1][0] = 1024;
// Kernel: axpbyCuda
// Kernel: axpbyCuda
blas_threads[1][1] = 128;
blas_blocks[1][1] = 128;
// Kernel: xpyCuda
// Kernel: xpyCuda
blas_threads[1][2] = 128;
blas_blocks[1][2] = 128;
// Kernel: axpyCuda
blas_threads[1][3] = 64;
// Kernel: axpyCuda
blas_threads[1][3] = 128;
blas_blocks[1][3] = 128;
// Kernel: xpayCuda
// Kernel: xpayCuda
blas_threads[1][4] = 128;
blas_blocks[1][4] = 128;
// Kernel: mxpyCuda
// Kernel: mxpyCuda
blas_threads[1][5] = 128;
blas_blocks[1][5] = 128;
// Kernel: axCuda
// Kernel: axCuda
blas_threads[1][6] = 64;
blas_blocks[1][6] = 128;
// Kernel: caxpyCuda
// Kernel: caxpyCuda
blas_threads[1][7] = 64;
blas_blocks[1][7] = 128;
// Kernel: caxpbyCuda
// Kernel: caxpbyCuda
blas_threads[1][8] = 64;
blas_blocks[1][8] = 128;
// Kernel: cxpaypbzCuda
// Kernel: cxpaypbzCuda
blas_threads[1][9] = 64;
blas_blocks[1][9] = 128;
// Kernel: axpyZpbxCuda
blas_threads[1][10] = 128;
blas_blocks[1][10] = 128;
// Kernel: axpyZpbxCuda
blas_threads[1][10] = 64;
blas_blocks[1][10] = 2048;
// Kernel: caxpbypzYmbwCuda
// Kernel: caxpbypzYmbwCuda
blas_threads[1][11] = 64;
blas_blocks[1][11] = 128;
// Kernel: sumCuda
// Kernel: sumCuda
blas_threads[1][12] = 128;
blas_blocks[1][12] = 1024;
// Kernel: normCuda
// Kernel: normCuda
blas_threads[1][13] = 128;
blas_blocks[1][13] = 1024;
// Kernel: reDotProductCuda
// Kernel: reDotProductCuda
blas_threads[1][14] = 128;
blas_blocks[1][14] = 1024;
// Kernel: axpyNormCuda
// Kernel: axpyNormCuda
blas_threads[1][15] = 128;
blas_blocks[1][15] = 1024;
// Kernel: xmyNormCuda
// Kernel: xmyNormCuda
blas_threads[1][16] = 128;
blas_blocks[1][16] = 1024;
// Kernel: cDotProductCuda
// Kernel: cDotProductCuda
blas_threads[1][17] = 128;
blas_blocks[1][17] = 512;
// Kernel: xpaycDotzyCuda
// Kernel: xpaycDotzyCuda
blas_threads[1][18] = 64;
blas_blocks[1][18] = 128;
// Kernel: cDotProductNormACuda
// Kernel: cDotProductNormACuda
blas_threads[1][19] = 64;
blas_blocks[1][19] = 1024;
// Kernel: cDotProductNormBCuda
// Kernel: cDotProductNormBCuda
blas_threads[1][20] = 64;
blas_blocks[1][20] = 1024;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[1][21] = 64;
blas_blocks[1][21] = 128;
// Kernel: copyCuda
// Kernel: copyCuda
blas_threads[2][0] = 64;
blas_blocks[2][0] = 128;
// Kernel: axpbyCuda
// Kernel: axpbyCuda
blas_threads[2][1] = 64;
blas_blocks[2][1] = 128;
// Kernel: xpyCuda
// Kernel: xpyCuda
blas_threads[2][2] = 64;
blas_blocks[2][2] = 128;
// Kernel: axpyCuda
// Kernel: axpyCuda
blas_threads[2][3] = 64;
blas_blocks[2][3] = 128;
// Kernel: xpayCuda
// Kernel: xpayCuda
blas_threads[2][4] = 64;
blas_blocks[2][4] = 128;
// Kernel: mxpyCuda
// Kernel: mxpyCuda
blas_threads[2][5] = 64;
blas_blocks[2][5] = 128;
// Kernel: axCuda
// Kernel: axCuda
blas_threads[2][6] = 64;
blas_blocks[2][6] = 2048;
// Kernel: caxpyCuda
// Kernel: caxpyCuda
blas_threads[2][7] = 64;
blas_blocks[2][7] = 2048;
// Kernel: caxpbyCuda
// Kernel: caxpbyCuda
blas_threads[2][8] = 64;
blas_blocks[2][8] = 128;
blas_blocks[2][8] = 2048;
// Kernel: cxpaypbzCuda
// Kernel: cxpaypbzCuda
blas_threads[2][9] = 64;
blas_blocks[2][9] = 1024;
blas_blocks[2][9] = 2048;
// Kernel: axpyZpbxCuda
// Kernel: axpyZpbxCuda
blas_threads[2][10] = 64;
blas_blocks[2][10] = 128;
// Kernel: caxpbypzYmbwCuda
// Kernel: caxpbypzYmbwCuda
blas_threads[2][11] = 64;
blas_blocks[2][11] = 128;
// Kernel: sumCuda
// Kernel: sumCuda
blas_threads[2][12] = 128;
blas_blocks[2][12] = 128;
// Kernel: normCuda
// Kernel: normCuda
blas_threads[2][13] = 128;
blas_blocks[2][13] = 128;
// Kernel: reDotProductCuda
// Kernel: reDotProductCuda
blas_threads[2][14] = 128;
blas_blocks[2][14] = 128;
// Kernel: axpyNormCuda
// Kernel: axpyNormCuda
blas_threads[2][15] = 64;
blas_blocks[2][15] = 128;
// Kernel: xmyNormCuda
// Kernel: xmyNormCuda
blas_threads[2][16] = 64;
blas_blocks[2][16] = 128;
// Kernel: cDotProductCuda
// Kernel: cDotProductCuda
blas_threads[2][17] = 64;
blas_blocks[2][17] = 128;
// Kernel: xpaycDotzyCuda
// Kernel: xpaycDotzyCuda
blas_threads[2][18] = 64;
blas_blocks[2][18] = 128;
// Kernel: cDotProductNormACuda
// Kernel: cDotProductNormACuda
blas_threads[2][19] = 64;
blas_blocks[2][19] = 128;
// Kernel: cDotProductNormBCuda
// Kernel: cDotProductNormBCuda
blas_threads[2][20] = 64;
blas_blocks[2][20] = 128;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[2][21] = 64;
blas_blocks[2][21] = 256;
......@@ -34,9 +34,9 @@ unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;
// Number of threads used for each blas kernel
int blas_threads[3][22];
static int blas_threads[3][22];
// Number of thread blocks for each blas kernel
int blas_blocks[3][22];
static int blas_blocks[3][22];
static dim3 blasBlock;
static dim3 blasGrid;
......
......@@ -187,6 +187,9 @@ double benchmark(int kernel) {
cudaEventSynchronize(end);
float runTime;
cudaEventElapsedTime(&runTime, start, end);
cudaEventDestroy(start);
cudaEventDestroy(end);
double secs = runTime / 1000;
return secs;
}
......@@ -242,8 +245,6 @@ int main(int argc, char** argv) {
for (int grid=0; grid<Ngrids; grid++) {
setBlasParam(i, prec, blockSizes[thread], gridSizes[grid]);
if (i==12) printfQuda("warmup %d %d\n", blockSizes[thread], gridSizes[grid]); // DEBUG
// first do warmup run
nIters = 1;
benchmark(kernels[i]);
......@@ -252,14 +253,6 @@ int main(int argc, char** argv) {
blas_quda_flops = 0;
blas_quda_bytes = 0;
// DEBUG
{
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error));
}
if (i==12) printfQuda("running %d %d\n", blockSizes[thread], gridSizes[grid]);
// END DEBUG
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops;
double bytes = blas_quda_bytes;
......@@ -269,8 +262,6 @@ int main(int argc, char** argv) {
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error)); // DEBUG
if (gbytes > gbytes_max && error == cudaSuccess) { // prevents selection of failed parameters
gflops_max = gflops;
gbytes_max = gbytes;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment