Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 7ee1d3d5 authored by mikeaclark's avatar mikeaclark
Browse files

Reduced memory requirement for CG and BiCGstab

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@518 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 2bbcd340
......@@ -22,34 +22,30 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
}
}
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
QudaInvertParam *invert_param, DagType dag_type)
{
ParitySpinor r = allocateParitySpinor(x.X, x.precision);
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor b = allocateParitySpinor(x.X, x.precision);
ParitySpinor x_sloppy, r_sloppy, tmp_sloppy, src_sloppy;
ParitySpinor x_sloppy, r_sloppy, r0;
if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
tmp_sloppy = tmp;
src_sloppy = src;
r0 = b;
} else {
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
copyCuda(src_sloppy, src);
r0 = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
copyCuda(r0, b);
}
zeroCuda(x_sloppy);
copyCuda(b, src);
copyCuda(r_sloppy, src);
copyCuda(r_sloppy, b);
zeroCuda(y);
......@@ -59,7 +55,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
double delta = invert_param->reliable_delta;
int k = 0;
int xUpdate = 0, rUpdate = 0;
int rUpdate = 0;
cuDoubleComplex rho = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex rho0 = rho;
......@@ -77,8 +73,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
double rNorm = sqrt(r2);
double r0Norm = rNorm;
double maxrx = rNorm;
double maxrr = rNorm;
double maxrx = rNorm;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
......@@ -90,7 +86,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(src_sloppy, r_sloppy); // BiCRstab
rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(r0, r_sloppy); // BiCRstab
copyCuda(p, r_sloppy);
} else {
alpha_omega = cuCdiv(alpha, omega);
......@@ -102,9 +98,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
}
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp_sloppy, dag_type);
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(src_sloppy, v);
rv = cDotProductCuda(r0, v);
alpha = cuCdiv(rho, rv);
......@@ -113,44 +109,38 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
MatVec(t, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp_sloppy, dag_type);
MatVec(t, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp, dag_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, r0);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates
rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm;
if (rNorm > maxrr) maxrr = rNorm;
int updateR = (rNorm < delta*maxrr && r0Norm <= maxrr) ? 1 : 0;
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) {
if (updateR || updateX) {
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param, tmp, dag_type);
xpyCuda(x, y);
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, y, invert_param, x, dag_type);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
zeroCuda(x_sloppy);
rNorm = sqrt(r2);
maxrr = rNorm;
maxrx = rNorm;
r0Norm = rNorm;
rUpdate++;
if (updateX) {
xpyCuda(x, y);
zeroCuda(x_sloppy);
copyCuda(b, r);
r0Norm = rNorm;
maxrx = rNorm;
xUpdate++;
}
}
k++;
......@@ -162,8 +152,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
if (invert_param->verbosity >= QUDA_VERBOSE) printf("Reliable updates = %d\n", rUpdate);
invert_param->secs += stopwatchReadSeconds();
......@@ -174,7 +163,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
#if 0
// Calculate the true residual
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param, tmp, dag_type);
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param, y, dag_type);
double true_res = xmyNormCuda(src, r);
copyCuda(b, src);
......@@ -183,18 +172,17 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(src_sloppy);
freeParitySpinor(tmp_sloppy);
freeParitySpinor(r0);
freeParitySpinor(r_sloppy);
freeParitySpinor(x_sloppy);
}
freeParitySpinor(b);
freeParitySpinor(y);
freeParitySpinor(r);
freeParitySpinor(tmp);
freeParitySpinor(v);
freeParitySpinor(t);
freeParitySpinor(p);
freeParitySpinor(y);
return;
}
......@@ -7,30 +7,34 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInvertParam *perf)
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp) {
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
kappa *= cudaGaugePrecise.anisotropy;
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(out, gauge, in, kappa, tmp, invert_param->matpc_type);
} else {
cloverMatPCDagMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp, invert_param->matpc_type);
}
}
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r, QudaInvertParam *invert_param)
{
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor r = allocateParitySpinor(x.X, x.precision);
ParitySpinor b;
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b = allocateParitySpinor(x.X, x.precision);
copyCuda(b, source);
} else {
b = source;
}
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor x_sloppy, r_sloppy, tmp_sloppy;
ParitySpinor x_sloppy, r_sloppy;
if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
tmp_sloppy = tmp;
} else {
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
}
copyCuda(r, b);
......@@ -39,15 +43,12 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
zeroCuda(x_sloppy);
zeroCuda(y);
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= cudaGaugePrecise.anisotropy;
double b2 = 0.0;
b2 = normCuda(b);
double r2 = b2;
double r2_old;
double stop = r2*perf->tol*perf->tol; // stopping condition of solver
double stop = r2*invert_param->tol*invert_param->tol; // stopping condition of solver
double alpha, beta;
double pAp;
......@@ -59,22 +60,16 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
double delta = invert_param->reliable_delta;
int k=0;
int xUpdate = 0, rUpdate = 0;
int rUpdate = 0;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
while (r2 > stop && k<invert_param->maxiter) {
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(Ap, cudaGaugeSloppy, p, kappa, tmp_sloppy, perf->matpc_type);
} else {
cloverMatPCDagMatPCCuda(Ap, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, kappa,
tmp_sloppy, perf->matpc_type);
}
MatVec(Ap, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp);
pAp = reDotProductCuda(p, Ap);
......@@ -89,37 +84,25 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (!updateR) {
if (!(updateR || updateX)) {
beta = r2 / r2_old;
axpyZpbxCuda(alpha, p, x_sloppy, r_sloppy, beta);
} else {
axpyCuda(alpha, p, x_sloppy);
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(r, cudaGaugePrecise, x, kappa, tmp, invert_param->matpc_type);
} else {
cloverMatPCDagMatPCCuda(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, kappa,
tmp, invert_param->matpc_type);
}
xpyCuda(x, y);
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, y, invert_param, x);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
rNorm = sqrt(r2);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
zeroCuda(x_sloppy);
rNorm = sqrt(r2);
maxrr = rNorm;
maxrx = rNorm;
r0Norm = rNorm;
rUpdate++;
if (updateX) {
xpyCuda(x, y);
zeroCuda(x_sloppy);
copyCuda(b, r);
r0Norm = rNorm;
maxrx = rNorm;
xUpdate++;
}
beta = r2 / r2_old;
xpayCuda(r_sloppy, beta, p);
......@@ -133,49 +116,40 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x);
perf->secs = stopwatchReadSeconds();
invert_param->secs = stopwatchReadSeconds();
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
printf("Reliable updates = %d\n", rUpdate);
float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
// printf("%f gflops\n", gflops / stopwatchReadSeconds());
perf->gflops = gflops;
perf->iter = k;
invert_param->gflops = gflops;
invert_param->iter = k;
blas_quda_flops = 0;
dslash_quda_flops = 0;
#if 0
// Calculate the true residual
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(Ap, cudaGaugePrecise, x, kappa, tmp, perf->matpc_type);
} else {
cloverMatPCDagMatPCCuda(Ap, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, kappa,
tmp, perf->matpc_type);
}
copyCuda(r, b);
mxpyCuda(Ap, r);
double true_res = normCuda(r);
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, y);
double true_res = xmyNormCuda(b, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(tmp_sloppy);
freeParitySpinor(r_sloppy);
freeParitySpinor(x_sloppy);
}
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeParitySpinor(b);
freeParitySpinor(r);
freeParitySpinor(p);
freeParitySpinor(Ap);
freeParitySpinor(tmp);
freeParitySpinor(y);
......
......@@ -18,7 +18,7 @@ int main(int argc, char **argv)
Gauge_param.X[0] = 24;
Gauge_param.X[1] = 24;
Gauge_param.X[2] = 24;
Gauge_param.X[3] = 24;
Gauge_param.X[3] = 128;
setDims(Gauge_param.X);
Gauge_param.anisotropy = 1.0;
......@@ -26,9 +26,9 @@ int main(int argc, char **argv)
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -44,22 +44,22 @@ int main(int argc, char **argv)
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
inv_param.inv_type = QUDA_CG_INVERTER;
double mass = -0.95;
double mass = -0.94;
inv_param.kappa = 1.0 / (2.0*(1 + 3/gauge_param->anisotropy + mass));
inv_param.tol = 1e-12;
inv_param.maxiter = 1000;
inv_param.reliable_delta = 1e-1;
inv_param.tol = 5e-7;
inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-3;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
if (clover_yes) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment