Advanced Computing Platform for Theoretical Physics

Commit c3ded658 authored by mikeaclark
Browse files

Out of memory check for dslash kernel

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@591 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 6cbfc6be
......@@ -3,43 +3,43 @@
*/
// Kernel: copyCuda
blas_threads[0][0] = 64;
blas_blocks[0][0] = 256;
blas_blocks[0][0] = 2048;
// Kernel: axpbyCuda
blas_threads[0][1] = 64;
blas_blocks[0][1] = 1024;
blas_blocks[0][1] = 2048;
// Kernel: xpyCuda
blas_threads[0][2] = 64;
blas_blocks[0][2] = 1024;
blas_blocks[0][2] = 2048;
// Kernel: axpyCuda
blas_threads[0][3] = 64;
blas_blocks[0][3] = 1024;
blas_blocks[0][3] = 2048;
// Kernel: xpayCuda
blas_threads[0][4] = 64;
blas_blocks[0][4] = 1024;
blas_blocks[0][4] = 2048;
// Kernel: mxpyCuda
blas_threads[0][5] = 64;
blas_blocks[0][5] = 1024;
blas_blocks[0][5] = 2048;
// Kernel: axCuda
blas_threads[0][6] = 64;
blas_blocks[0][6] = 1024;
blas_blocks[0][6] = 2048;
// Kernel: caxpyCuda
blas_threads[0][7] = 64;
blas_blocks[0][7] = 1024;
blas_blocks[0][7] = 2048;
// Kernel: caxpbyCuda
blas_threads[0][8] = 64;
blas_blocks[0][8] = 1024;
blas_blocks[0][8] = 2048;
// Kernel: cxpaypbzCuda
blas_threads[0][9] = 64;
blas_blocks[0][9] = 1024;
blas_blocks[0][9] = 2048;
// Kernel: axpyZpbxCuda
blas_threads[0][10] = 64;
......@@ -102,7 +102,7 @@ blas_threads[1][2] = 128;
blas_blocks[1][2] = 128;
// Kernel: axpyCuda
blas_threads[1][3] = 128;
blas_threads[1][3] = 64;
blas_blocks[1][3] = 128;
// Kernel: xpayCuda
......@@ -203,11 +203,11 @@ blas_blocks[2][5] = 128;
// Kernel: axCuda
blas_threads[2][6] = 64;
blas_blocks[2][6] = 1024;
blas_blocks[2][6] = 2048;
// Kernel: caxpyCuda
blas_threads[2][7] = 64;
blas_blocks[2][7] = 128;
blas_blocks[2][7] = 2048;
// Kernel: caxpbyCuda
blas_threads[2][8] = 64;
......
......@@ -122,6 +122,13 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("initDslashConstants error: %s\n", cudaGetErrorString(error));
exit(0);
}
initDslash = 1;
}
......@@ -170,6 +177,14 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
dslash_quda_flops += 1320*in.volume;
}
......@@ -374,6 +389,13 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
dslash_quda_flops += (1320+48)*in.volume;
}
......@@ -647,6 +669,13 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
dslash_quda_flops += (1320+504)*in.volume;
}
......@@ -1153,6 +1182,13 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
dslash_quda_flops += (1320+504+48)*in.volume;
}
......@@ -1770,6 +1806,13 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
cloverHCuda(out, gauge, clover, in, parity);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
dslash_quda_flops += 504*in.volume;
}
......
......@@ -11,6 +11,14 @@
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp, DagType dag_type) {
{cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("B4 MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}}
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
kappa *= cudaGaugePrecise.anisotropy;
......@@ -21,6 +29,14 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp,
invert_param->matpc_type, dag_type);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}
}
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
......@@ -48,14 +64,6 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
zeroCuda(x_sloppy);
copyCuda(r_sloppy, b);
/*{
cuDoubleComplex rv;
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(r0, v);
printf("%e %e %e %e %e\n", rv.x, rv.y, normCuda(r_sloppy), normCuda(v), normCuda(tmp)); exit(0);
} */
zeroCuda(y);
double b2 = normCuda(b);
......@@ -109,6 +117,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
}
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(r0, v);
......
......@@ -14,7 +14,7 @@ ParitySpinor x, y, z, w, v, p;
int nIters;
int Nthreads = 3;
int Ngrids = 5;
int Ngrids = 6;
int blockSizes[] = {64, 128, 256};
int gridSizes[] = {64, 128, 256, 512, 1024, 2048};
......
......@@ -50,7 +50,7 @@ void init() {
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
......
......@@ -21,7 +21,7 @@ int main(int argc, char **argv)
gauge_param.X[0] = 24;
gauge_param.X[1] = 24;
gauge_param.X[2] = 24;
gauge_param.X[3] = 64;
gauge_param.X[3] = 48;
setDims(gauge_param.X);
gauge_param.anisotropy = 1.0;
......@@ -29,9 +29,9 @@ int main(int argc, char **argv)
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -55,8 +55,8 @@ int main(int argc, char **argv)
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment