Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 86503876 authored by mikeaclark's avatar mikeaclark
Browse files

Fully implemented half precision BLAS routines

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@400 be54200a-260c-0410-bdd7-ce6af2a381ab
parent b7252b18
This diff is collapsed.
......@@ -10,37 +10,37 @@ extern "C" {
// ---------- blas_quda.cu ----------
void zeroQuda(ParitySpinor a);
void copyQuda(ParitySpinor dst, ParitySpinor src);
void zeroCuda(ParitySpinor a);
void copyCuda(ParitySpinor dst, ParitySpinor src);
double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y);
double sumQuda(ParitySpinor b);
double normQuda(ParitySpinor b);
double reDotProductQuda(ParitySpinor a, ParitySpinor b);
double xmyNormQuda(ParitySpinor a, ParitySpinor b);
double axpyNormCuda(double a, ParitySpinor x, ParitySpinor y);
double sumCuda(ParitySpinor b);
double normCuda(ParitySpinor b);
double reDotProductCuda(ParitySpinor a, ParitySpinor b);
double xmyNormCuda(ParitySpinor a, ParitySpinor b);
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyQuda(double a, ParitySpinor x, ParitySpinor y);
void axQuda(double a, ParitySpinor x);
void xpyQuda(ParitySpinor x, ParitySpinor y);
void xpayQuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyQuda(ParitySpinor x, ParitySpinor y);
void axpbyCuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyCuda(double a, ParitySpinor x, ParitySpinor y);
void axCuda(double a, ParitySpinor x);
void xpyCuda(ParitySpinor x, ParitySpinor y);
void xpayCuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyCuda(ParitySpinor x, ParitySpinor y);
void axpyZpbxQuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, double b);
void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, double b);
void caxpbyQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y);
void caxpyQuda(double2 a, ParitySpinor x, ParitySpinor y);
void cxpaypbzQuda(ParitySpinor, double2 b, ParitySpinor y, double2 c, ParitySpinor z);
void caxpbypzYmbwQuda(double2, ParitySpinor, double2, ParitySpinor, ParitySpinor, ParitySpinor);
void caxpbyCuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y);
void caxpyCuda(double2 a, ParitySpinor x, ParitySpinor y);
void cxpaypbzCuda(ParitySpinor, double2 b, ParitySpinor y, double2 c, ParitySpinor z);
void caxpbypzYmbwCuda(double2, ParitySpinor, double2, ParitySpinor, ParitySpinor, ParitySpinor);
cuDoubleComplex cDotProductQuda(ParitySpinor, ParitySpinor);
cuDoubleComplex xpaycDotzyQuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z);
cuDoubleComplex cDotProductCuda(ParitySpinor, ParitySpinor);
cuDoubleComplex xpaycDotzyCuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z);
void blasTest();
void axpbyTest();
double3 cDotProductNormAQuda(ParitySpinor a, ParitySpinor b);
double3 cDotProductNormBQuda(ParitySpinor a, ParitySpinor b);
double3 cDotProductNormACuda(ParitySpinor a, ParitySpinor b);
double3 cDotProductNormBCuda(ParitySpinor a, ParitySpinor b);
double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
ParitySpinor z, ParitySpinor w, ParitySpinor u);
......
......@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 2;
int test_type = 0;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -25,12 +25,12 @@ void *spinorEven, *spinorOdd;
double kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 1; // include transfer time in the benchmark?
int TRANSFER = 0; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
......@@ -45,7 +45,7 @@ void init() {
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
......
......@@ -31,15 +31,15 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
r_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
copyQuda(src_sloppy, src);
copyCuda(src_sloppy, src);
}
zeroQuda(x_sloppy);
copyQuda(b, src);
copyQuda(r_sloppy, src_sloppy);
zeroQuda(y);
zeroCuda(x_sloppy);
copyCuda(b, src);
copyCuda(r_sloppy, src_sloppy);
zeroCuda(y);
double b2 = normQuda(b);
double b2 = normCuda(b);
double r2 = b2;
double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver
......@@ -68,13 +68,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
int k=0;
int xUpdate = 0, rUpdate = 0;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0);
copyQuda(p, r_sloppy);
copyCuda(p, r_sloppy);
} else {
alpha_omega = cuCdiv(alpha, omega);
rho_rho0 = cuCdiv(rho, rho0);
......@@ -82,7 +82,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
// p = r - beta*omega*v + beta*(p)
beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0;
cxpaypbzQuda(r_sloppy, beta_omega, v, beta, p); // 8
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
}
if (dag_type == QUDA_DAG_NO)
......@@ -90,12 +90,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
else
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
rv = cDotProductQuda(src_sloppy, v);
rv = cDotProductCuda(src_sloppy, v);
alpha = cuCdiv(rho, rv);
// r -= alpha*v
alpha.x *= -1.0; alpha.y *= -1.0;
caxpyQuda(alpha, v, r_sloppy); // 4
caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
if (dag_type == QUDA_DAG_NO)
......@@ -104,7 +105,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
MatPCDagCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormAQuda(t, r_sloppy); // 6
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
......@@ -119,24 +120,24 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) {
if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
if (dag_type == QUDA_DAG_NO)
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
r2 = xmyNormQuda(b, r);
if (x.precision != r_sloppy.precision) copyQuda(r_sloppy, r);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
rNorm = sqrt(r2);
maxrr = rNorm;
rUpdate++;
if (updateX) {
xpyQuda(x, y);
zeroQuda(x_sloppy);
copyQuda(b, r);
xpyCuda(x, y);
zeroCuda(x_sloppy);
copyCuda(b, r);
r0Norm = rNorm;
maxrx = rNorm;
......@@ -146,11 +147,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
}
k++;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
printf("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
xpyQuda(y, x);
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x);
invert_param->secs += stopwatchReadSeconds();
......@@ -165,16 +166,16 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
invert_param->gflops += gflops;
invert_param->iter += k;
//#if 0
#if 0
// Calculate the true residual
if (dag_type == QUDA_DAG_NO)
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
double true_res = xmyNormQuda(src, r);
double true_res = xmyNormCuda(src, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, r2, sqrt(true_res / b2));
//#endif
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(src_sloppy);
......
......@@ -15,7 +15,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor Ap = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
double b2 = 0.0;
b2 = normQuda(b);
b2 = normCuda(b);
double r2 = b2;
double r2_old;
......@@ -26,12 +26,12 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
copyQuda(r, b);
copyCuda(r, b);
} else {
r = b;
}
copyQuda(p, r);
zeroQuda(x);
copyCuda(p, r);
zeroCuda(x);
int k=0;
printf("%d iterations, r2 = %e\n", k, r2);
......@@ -39,15 +39,15 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
pAp = reDotProductQuda(p, Ap);
pAp = reDotProductCuda(p, Ap);
alpha = r2 / pAp;
r2_old = r2;
r2 = axpyNormQuda(-alpha, Ap, r);
r2 = axpyNormCuda(-alpha, Ap, r);
beta = r2 / r2_old;
axpyZpbxQuda(alpha, p, x, r, beta);
axpyZpbxCuda(alpha, p, x, r, beta);
k++;
printf("%d iterations, r2 = %e\n", k, r2);
......@@ -65,9 +65,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
#if 0
// Calculate the true residual
MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type);
copyQuda(r, b);
mxpyQuda(Ap, r);
double true_res = normQuda(r);
copyCuda(r, b);
mxpyCuda(Ap, r);
double true_res = normCuda(r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
......
......@@ -251,8 +251,8 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION) {
axQuda(2.0*kappa, b.even);
axQuda(2.0*kappa, b.odd);
axCuda(2.0*kappa, b.even);
axCuda(2.0*kappa, b.odd);
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -268,15 +268,15 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION)
if (param->solution_type == QUDA_MATPC_SOLUTION)
axQuda(4.0*kappa*kappa, in);
axCuda(4.0*kappa*kappa, in);
else
axQuda(16.0*pow(kappa,4), in);
axCuda(16.0*pow(kappa,4), in);
}
switch (param->inv_type) {
case QUDA_CG_INVERTER:
if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
copyQuda(out, in);
copyCuda(out, in);
MatPCDagCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type);
}
invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
......@@ -284,7 +284,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_YES);
copyQuda(in, out);
copyCuda(in, out);
}
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_NO);
break;
......
......@@ -21,7 +21,7 @@ int main(int argc, char **argv)
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.X = L1;
......@@ -36,15 +36,15 @@ int main(int argc, char **argv)
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
double mass = -0.97;
double mass = -0.96;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-12;
inv_param.tol = 1e-7;
inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-1;
inv_param.reliable_delta = 1e-2;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
......
......@@ -704,7 +704,7 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
} else {
ParitySpinor tmp = allocateParitySpinor(ret.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
loadParitySpinor(tmp, spinor, cpu_prec, dirac_order);
copyQuda(ret, tmp);
copyCuda(ret, tmp);
freeParitySpinor(tmp);
}
......@@ -745,8 +745,8 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
} else {
FullSpinor tmp = allocateSpinorField(2*ret.even.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
loadFullSpinor(tmp, spinor, cpu_prec);
copyQuda(ret.even, tmp.even);
copyQuda(ret.odd, tmp.odd);
copyCuda(ret.even, tmp.even);
copyCuda(ret.odd, tmp.odd);
freeSpinorField(tmp);
}
......@@ -799,7 +799,7 @@ void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Di
}
} else {
ParitySpinor tmp = allocateParitySpinor(spinor.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
copyQuda(tmp, spinor);
copyCuda(tmp, spinor);
retrieveParitySpinor(res, tmp, cpu_prec, dirac_order);
freeParitySpinor(tmp);
}
......@@ -832,8 +832,8 @@ void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
packedSpinor2 = 0;
} else {
FullSpinor tmp = allocateSpinorField(2*spinor.even.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
copyQuda(tmp.even, spinor.even);
copyQuda(tmp.odd, spinor.odd);
copyCuda(tmp.even, spinor.even);
copyCuda(tmp.odd, spinor.odd);
retrieveFullSpinor(res, tmp, cpu_prec);
freeSpinorField(tmp);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment