Advanced Computing Platform for Theoretical Physics

Committing large files can make the server unstable; please try to commit only code and avoid committing large files.

Commit 06609409 authored by mikeaclark

Added initial blas benchmarking

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@486 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7b168c34
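A note on the bookkeeping introduced below: blas_quda_flops counts floating-point operations in units of real numbers, since x.length counts reals (the complex kernels halve it internally). For example, real axpy (y = a*x + y) costs one multiply and one add per real, hence blas_quda_flops += 2*x.length, while complex caxpy costs a complex multiply (6 flops) plus a complex add (2 flops) per complex element, i.e. 4 flops per real, hence += 4*x.length.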
......@@ -34,7 +34,7 @@ NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
NVCCFLAGS = -O3 $(NVCCOPT) -arch=$(GPU_ARCH) $(INC)
LDFLAGS = -fPIC $(LIB)
all: dslash_test invert_test su3_test pack_test
all: dslash_test invert_test su3_test pack_test blas_test
QUDA = libquda.a
QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o \
......@@ -62,6 +62,9 @@ su3_test: su3_test.o $(QUDA)
pack_test: pack_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
blas_test: blas_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
gen:
$(PYTHON) dslash_cuda_gen.py
......
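With the target wired into the default goal, the benchmark builds and runs like the existing tests: make blas_test (or make all) followed by ./blas_test, assuming CUDA_INSTALL_PATH and GPU_ARCH are set at the top of the Makefile as for the other targets.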
......@@ -35,6 +35,8 @@ int blocksFloat = 0;
int blocksComplex = 0;
int blocksFloat3 = 0;
unsigned long long blas_quda_flops;
void initReduceFloat(int blocks) {
if (blocks != blocksFloat) {
if (blocksFloat > 0) cudaFree(h_reduceFloat);
......@@ -46,7 +48,7 @@ void initReduceFloat(int blocks) {
blocksFloat = blocks;
printf("Initialized reduce floats %d\n", blocksFloat);
//printf("Initialized reduce floats %d\n", blocksFloat);
}
}
......@@ -60,7 +62,7 @@ void initReduceComplex(int blocks) {
}
blocksComplex = blocks;
printf("Initialized reduce complex %d\n", blocksComplex);
//printf("Initialized reduce complex %d\n", blocksComplex);
}
}
......@@ -74,7 +76,7 @@ void initReduceFloat3(int blocks) {
}
blocksFloat3 = blocks;
printf("Initialized reduce float3 %d\n", blocksFloat3);
//printf("Initialized reduce float3 %d\n", blocksFloat3);
}
}
......@@ -493,6 +495,7 @@ void axpbyCuda(double a, ParitySpinor x, double b, ParitySpinor y) {
axpbyHKernel<<<dimGrid, dimBlock>>>((float)a, (float)b, (short4*)y.spinor,
(float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += 3*x.length;
}
template <typename Float>
......@@ -541,6 +544,7 @@ void xpyCuda(ParitySpinor x, ParitySpinor y) {
cudaBindTexture(0, texNorm2, y.spinorNorm, spinor_bytes/12);
xpyHKernel<<<dimGrid, dimBlock>>>((short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += x.length;
}
template <typename Float>
......@@ -589,6 +593,7 @@ void axpyCuda(double a, ParitySpinor x, ParitySpinor y) {
cudaBindTexture(0, texNorm2, y.spinorNorm, spinor_bytes/12);
axpyHKernel<<<dimGrid, dimBlock>>>((float)a, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += 2*x.length;
}
template <typename Float>
......@@ -637,6 +642,7 @@ void xpayCuda(ParitySpinor x, double a, ParitySpinor y) {
cudaBindTexture(0, texNorm2, y.spinorNorm, spinor_bytes/12);
xpayHKernel<<<dimGrid, dimBlock>>>((float)a, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += 2*x.length;
}
template <typename Float>
......@@ -686,6 +692,7 @@ void mxpyQuda(ParitySpinor x, ParitySpinor y) {
cudaBindTexture(0, texNorm2, y.spinorNorm, spinor_bytes/12);
mxpyHKernel<<<dimGrid, dimBlock>>>((short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += x.length;
}
template <typename Float>
......@@ -726,6 +733,7 @@ void axCuda(double a, ParitySpinor x) {
cudaBindTexture(0, texNorm1, x.spinorNorm, spinor_bytes/12);
axHKernel<<<dimGrid, dimBlock>>>((float)a, (short4*)x.spinor, (float*)x.spinorNorm, x.length/spinorSiteSize);
}
blas_quda_flops += x.length;
}
template <typename Float2>
......@@ -784,6 +792,7 @@ void caxpyCuda(double2 a, ParitySpinor x, ParitySpinor y) {
float2 af2 = make_float2((float)a.x, (float)a.y);
caxpyHKernel<<<dimGrid, dimBlock>>>(af2, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += 4*x.length;
}
template <typename Float2>
......@@ -823,7 +832,7 @@ __global__ void caxpbyHKernel(float2 a, float2 b, short4 *yH, float *yN, int len
// performs the operation y[i] = a*x[i] + b*y[i]
void caxbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
checkSpinor(x,y);
int length = x.length/2;
int blocks = min(REDUCE_MAX_BLOCKS, max(length/REDUCE_THREADS, 1));
......@@ -845,6 +854,7 @@ void caxbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y) {
float2 bf2 = make_float2((float)b.x, (float)b.y);
caxpbyHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)y.spinor, (float*)y.spinorNorm, y.length/spinorSiteSize);
}
blas_quda_flops += 7*x.length;
}
template <typename Float2>
......@@ -919,6 +929,7 @@ void cxpaypbzCuda(ParitySpinor x, double2 a, ParitySpinor y, double2 b, ParitySp
float2 bf2 = make_float2((float)b.x, (float)b.y);
cxpaypbzHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)z.spinor, (float*)z.spinorNorm, z.length/spinorSiteSize);
}
blas_quda_flops += 8*x.length;
}
template <typename Float>
......@@ -960,7 +971,7 @@ __global__ void axpyZpbxHKernel(float a, float b, short4 *xH, float *xN, short4
}
// performs the operations: {y[i] = a x[i] + y[i]; x[i] = z[i] + b x[i]}
// performs the operations: {y[i] = a*x[i] + y[i]; x[i] = z[i] + b*x[i]}
void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, double b) {
checkSpinor(x,y);
checkSpinor(x,z);
......@@ -982,6 +993,7 @@ void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, doub
axpyZpbxHKernel<<<dimGrid, dimBlock>>>((float)a, (float)b, (short4*)x.spinor, (float*)x.spinorNorm,
(short4*)y.spinor, (float*)y.spinorNorm, z.length/spinorSiteSize);
}
blas_quda_flops += 8*x.length;
}
template <typename Float2>
......@@ -1073,6 +1085,7 @@ void caxpbypzYmbwCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
caxpbypzYmbwHKernel<<<dimGrid, dimBlock>>>(af2, bf2, (short4*)y.spinor, (float*)y.spinorNorm,
(short4*)z.spinor, (float*)z.spinorNorm, z.length/spinorSiteSize);
}
blas_quda_flops += 12*x.length;
}
......@@ -1174,6 +1187,7 @@ template <typename Float>
#undef REDUCE_OPERATION
double sumCuda(ParitySpinor a) {
blas_quda_flops += a.length;
if (a.precision == QUDA_DOUBLE_PRECISION) {
return sumFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
......@@ -1227,6 +1241,7 @@ template <typename Float>
#undef REDUCE_OPERATION
double normCuda(ParitySpinor a) {
blas_quda_flops += 2*a.length;
if (a.precision == QUDA_DOUBLE_PRECISION) {
return normFCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
......@@ -1283,6 +1298,7 @@ template <typename Float>
#undef REDUCE_OPERATION
double reDotProductCuda(ParitySpinor a, ParitySpinor b) {
blas_quda_flops += 2*a.length;
checkSpinor(a, b);
if (a.precision == QUDA_DOUBLE_PRECISION) {
return reDotProductFCuda((double*)a.spinor, (double*)b.spinor, a.length);
......@@ -1348,6 +1364,7 @@ template <typename Float>
#undef REDUCE_OPERATION
double axpyNormCuda(double a, ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length;
checkSpinor(x,y);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return axpyNormFCuda(a, (double*)x.spinor, (double*)y.spinor, x.length);
......@@ -1414,6 +1431,7 @@ template <typename Float>
#undef REDUCE_OPERATION
double xmyNormCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 3*x.length;
checkSpinor(x,y);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return xmyNormFCuda((double*)x.spinor, (double*)y.spinor, x.length);
......@@ -1484,6 +1502,7 @@ template <typename Float, typename Float2>
#undef REDUCE_IMAG_OPERATION
double2 cDotProductCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 4*x.length;
checkSpinor(x,y);
int length = x.length/2;
if (x.precision == QUDA_DOUBLE_PRECISION) {
......@@ -1568,6 +1587,7 @@ template <typename Float, typename Float2>
#undef REDUCE_IMAG_OPERATION
double2 xpayDotzyCuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z) {
blas_quda_flops += 6*x.length;
checkSpinor(x,y);
checkSpinor(x,z);
int length = x.length/2;
......@@ -1657,6 +1677,7 @@ template <typename Float2>
#undef REDUCE_Z_OPERATION
double3 cDotProductNormACuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length;
checkSpinor(x,y);
int length = x.length/2;
if (x.precision == QUDA_DOUBLE_PRECISION) {
......@@ -1742,6 +1763,7 @@ template <typename Float2>
#undef REDUCE_Z_OPERATION
double3 cDotProductNormBCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += 6*x.length;
checkSpinor(x,y);
int length = x.length/2;
if (x.precision == QUDA_DOUBLE_PRECISION) {
......@@ -1870,6 +1892,7 @@ template <typename Float2>
// This convoluted kernel does the following: z += a*x + b*y, y -= b*w, norm = (y,y), dot = (u, y)
double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
ParitySpinor z, ParitySpinor w, ParitySpinor u) {
blas_quda_flops += 18*x.length;
checkSpinor(x,y);
checkSpinor(x,z);
checkSpinor(x,w);
......
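For reference, a sketch of the convention the increments above follow (my reading of the diff: a complex multiply counts as 6 flops, a complex add as 2, and x.length counts reals):
// real y = a*x + b*y : 3 flops/real                        -> += 3*x.length
// real y = a*x + y   : 2 flops/real                        -> += 2*x.length
// cplx y += a*x      : 6 + 2 = 8 flops/complex = 4/real    -> += 4*x.length
// cplx y = a*x + b*y : 2*6 + 2 = 14 flops/complex = 7/real -> += 7*x.length
// norm (x,x)         : multiply + add = 2 flops/real       -> += 2*x.length
// cplx dot (x,y)     : 8 flops/complex = 4/real            -> += 4*x.length
// z += a*x + b*y, y -= b*w : (16 + 8)/complex = 12/real    -> += 12*x.length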
......@@ -43,7 +43,9 @@ extern "C" {
double3 cDotProductNormBCuda(ParitySpinor a, ParitySpinor b);
double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
ParitySpinor z, ParitySpinor w, ParitySpinor u);
extern unsigned long long blas_quda_flops;
#ifdef __cplusplus
}
#endif
......
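A minimal sketch of how a caller can use the exported counter (hypothetical helper, not part of this commit; assumes initQuda has run, x and y are allocated ParitySpinor fields, and stopwatchStart/stopwatchReadSeconds are the timers used by the tests in this tree):
void timeAxpy(ParitySpinor x, ParitySpinor y, int iters) {
  blas_quda_flops = 0;                  // reset the global counter
  stopwatchStart();
  for (int i = 0; i < iters; i++) axpyCuda(2.0, x, y);
  cudaThreadSynchronize();              // kernel launches are asynchronous
  double secs = stopwatchReadSeconds();
  printf("axpy: %f Gflops/s\n", blas_quda_flops / secs * 1e-9);
}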
#include <stdio.h>
#include <stdlib.h>
#include <quda.h>
#include <util_quda.h>
#include <spinor_quda.h>
#include <blas_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
QudaInvertParam inv_param;
ParitySpinor x, y, z, w, v;
int nIters = 1000;
void init() {
int X[4];
X[0] = 32;
X[1] = 32;
X[2] = 32;
X[3] = 32;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.verbosity = QUDA_VERBOSE;
invert_param = &inv_param;
int dev = 0;
initQuda(dev);
// need single parity dimensions
X[0] /= 2;
v = allocateParitySpinor(X, inv_param.cuda_prec);
w = allocateParitySpinor(X, inv_param.cuda_prec);
x = allocateParitySpinor(X, inv_param.cuda_prec);
y = allocateParitySpinor(X, inv_param.cuda_prec);
z = allocateParitySpinor(X, inv_param.cuda_prec);
}
void end() {
// release memory
freeParitySpinor(v);
freeParitySpinor(w);
freeParitySpinor(x);
freeParitySpinor(y);
freeParitySpinor(z);
endQuda();
}
double benchmark(int kernel) {
double a = 1.0, b = 2.0;              // fixed values so the kernels are not fed indeterminate data
double2 a2 = make_double2(1.0, 2.0);
double2 b2 = make_double2(3.0, 4.0);
//printf("Executing %d kernel loops...", nIters);
//fflush(stdout);
stopwatchStart();
for (int i=0; i < nIters; ++i) {
switch (kernel) {
case 0:
axpbyCuda(a, x, b, y);
break;
case 1:
xpyCuda(x, y);
break;
case 2:
axpyCuda(a, x, y);
break;
case 3:
xpayCuda(x, a, y);
break;
case 4:
//mxpyQuda(x, y);
break;
case 5:
axCuda(a, x);
break;
case 6:
caxpyCuda(a2, x, y);
break;
case 7:
caxpbyCuda(a2, x, b2, y);
break;
case 8:
cxpaypbzCuda(x, a2, y, b2, z);
break;
case 9:
axpyZpbxCuda(a, x, y, z, b);
break;
case 10:
caxpbypzYmbwCuda(a2, x, b2, y, z, w);
break;
// double
case 11:
sumCuda(x);
break;
case 12:
normCuda(x);
break;
case 13:
reDotProductCuda(x, y);
break;
case 14:
axpyNormCuda(a, x, y);
break;
case 15:
xmyNormCuda(x, y);
break;
// double2
case 16:
cDotProductCuda(x, y);
break;
case 17:
//xpayDotzyCuda(x, a, y, z);
break;
// double3
case 18:
cDotProductNormACuda(x, y);
break;
case 19:
cDotProductNormBCuda(x, y);
break;
case 20:
caxpbypzYmbwcDotProductWYNormYQuda(a2, x, b2, y, z, w, v);
break;
default:
printf("Undefined blas kernel %d\n", kernel);
exit(1);
}
}
cudaThreadSynchronize(); // kernel launches are asynchronous, so wait before reading the timer
double secs = stopwatchReadSeconds() / nIters;
return secs;
}
int main(int argc, char** argv) {
init();
int kernels[] = {0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20};
int nKernels = sizeof(kernels) / sizeof(kernels[0]); // 19 entries; kernels 4 and 17 are disabled above
// first do a warmup run
nIters = 1;
for (int i = 0; i < nKernels; i++) {
benchmark(kernels[i]);
}
nIters = 1000;
for (int i = 0; i < nKernels; i++) {
blas_quda_flops = 0;
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops / (double)nIters;
printf("Average time: %f s, flops = %e, Gflops/s = %f\n", secs, flops, flops/secs*1e-9);
//printf("Bandwidth: %f GiB/s\n\n", GiB / secs);
}
}
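To put the printed numbers in scale: for the 32^4 lattice above, each parity spinor holds x.length = (32^4 / 2) * 24 = 12,582,912 reals (assuming spinorSiteSize = 24, i.e. 4 spins x 3 colors x 2 reals per site), so a single axpyCuda call is counted as 2 * 12,582,912, roughly 25 Mflop, and the 1000-iteration timing loop as roughly 25 Gflop.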
......@@ -10,7 +10,7 @@
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover)
int clover_yes = 1;
int clover_yes = 0;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -45,11 +45,11 @@ void init() {
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_YES;
gaugeParam.blockDim = 64;
......@@ -64,7 +64,7 @@ void init() {
inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER; // both branches currently select QUDA_DIRAC_ORDER
......@@ -307,8 +307,8 @@ void dslashTest() {
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
//if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
//else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
}
end();
}
......
......@@ -18,7 +18,7 @@ int main(int argc, char **argv)
Gauge_param.X[0] = 24;
Gauge_param.X[1] = 24;
Gauge_param.X[2] = 24;
Gauge_param.X[3] = 32;
Gauge_param.X[3] = 48;
setDims(Gauge_param.X);
Gauge_param.anisotropy = 1.0;
......@@ -26,11 +26,11 @@ int main(int argc, char **argv)
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES;
Gauge_param.blockDim = 64;
Gauge_param.blockDim_sloppy = 64;
......@@ -49,16 +49,16 @@ int main(int argc, char **argv)
double mass = -0.95;
inv_param.kappa = 1.0 / (2.0*(1 + 3/gauge_param->anisotropy + mass));
inv_param.tol = 1e-12;
inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-3;
inv_param.maxiter = 200;
inv_param.reliable_delta = 1e-13;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
......
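For the record, the kappa formula above with anisotropy = 1.0 and mass = -0.95 evaluates to kappa = 1 / (2 * (1 + 3 - 0.95)) = 1 / 6.1, approximately 0.16393.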
......@@ -509,7 +509,8 @@ void construct_spinor_field(void *spinor, int type, int i0, int s0, int c0, Prec
template <typename Float>
void compareSpinor(Float *spinorRef, Float *spinorGPU, int len) {
int fail_check = 16;
int res = 1;
int fail_check = 16*res;
int fail[fail_check];
for (int f=0; f<fail_check; f++) fail[f] = 0;
......@@ -521,7 +522,7 @@ void compareSpinor(Float *spinorRef, Float *spinorGPU, int len) {
int is = i*24+j;
double diff = fabs(spinorRef[is]-spinorGPU[is]);
for (int f=0; f<fail_check; f++)
if (diff > pow(10.0,-(f+1))) fail[f]++;
if (diff > pow(10.0,-(f+1)/(double)res)) fail[f]++;
//if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
if (diff > 1e-3) iter[j]++;
}
......@@ -530,7 +531,7 @@ void compareSpinor(Float *spinorRef, Float *spinorGPU, int len) {
for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
for (int f=0; f<fail_check; f++) {
printf("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*24, fail[f] / (double)(len*24));
printf("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)/(double)res), fail[f], len*24, fail[f] / (double)(len*24));
}
}
......
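A note on the new res factor: with res = 1 the fail_check = 16 thresholds 10^(-(f+1)/res) are the same decades 1e-1 through 1e-16 as before, so this change is behavior-neutral. A larger res would bin deviations on a finer grid (res = 2 gives half-decade thresholds 10^-0.5, 10^-1, 10^-1.5, ...), while fail_check = 16*res scales the number of bins so the same overall range is still covered.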