Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 08aea0d0 authored by mikeaclark's avatar mikeaclark
Browse files

Fixed CPS anisotropy and created dslash flops counter

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@494 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 5d0defb3
...@@ -95,6 +95,9 @@ __constant__ double t_boundary; ...@@ -95,6 +95,9 @@ __constant__ double t_boundary;
static int initDslash = 0; static int initDslash = 0;
unsigned long long dslash_quda_flops;
unsigned long long dslash_quda_bytes;
//#include <dslash_def.h> // Dslash kernel definitions //#include <dslash_def.h> // Dslash kernel definitions
// kludge to avoid '#include nested too deeply' error // kludge to avoid '#include nested too deeply' error
...@@ -306,6 +309,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, ...@@ -306,6 +309,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger); dslashHCuda(out, gauge, in, parity, dagger);
} }
dslash_quda_flops += 1320*in.volume;
} }
...@@ -508,6 +512,8 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari ...@@ -508,6 +512,8 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
} else if (in.precision == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a); dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
} }
dslash_quda_flops += (1320+48)*in.volume;
} }
...@@ -768,6 +774,8 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ...@@ -768,6 +774,8 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
} else if (in.precision == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger); cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger);
} }
dslash_quda_flops += (1320+504)*in.volume;
} }
void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
...@@ -1272,6 +1280,8 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn ...@@ -1272,6 +1280,8 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
} else if (in.precision == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a); cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a);
} }
dslash_quda_flops += (1320+504+48)*in.volume;
} }
...@@ -1887,6 +1897,8 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover, ...@@ -1887,6 +1897,8 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
} else if (in.precision == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
cloverHCuda(out, gauge, clover, in, parity); cloverHCuda(out, gauge, clover, in, parity);
} }
dslash_quda_flops += 504*in.volume;
} }
void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover, void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
......
...@@ -122,6 +122,9 @@ extern "C" { ...@@ -122,6 +122,9 @@ extern "C" {
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type); QudaInvertParam *param, DagType dag_type);
extern unsigned long long dslash_quda_flops;
extern unsigned long long dslash_quda_bytes;
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -36,7 +36,7 @@ void init() { ...@@ -36,7 +36,7 @@ void init() {
gaugeParam.X[0] = 24; gaugeParam.X[0] = 24;
gaugeParam.X[1] = 24; gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24; gaugeParam.X[2] = 24;
gaugeParam.X[3] = 24; gaugeParam.X[3] = 48;
setDims(gaugeParam.X); setDims(gaugeParam.X);
gaugeParam.anisotropy = 2.3; gaugeParam.anisotropy = 2.3;
...@@ -45,7 +45,7 @@ void init() { ...@@ -45,7 +45,7 @@ void init() {
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_HALF_PRECISION; gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
...@@ -64,7 +64,7 @@ void init() { ...@@ -64,7 +64,7 @@ void init() {
inv_param.matpc_type = QUDA_MATPC_ODD_ODD; inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION; inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER; else inv_param.dirac_order = QUDA_DIRAC_ORDER;
...@@ -308,8 +308,8 @@ void dslashTest() { ...@@ -308,8 +308,8 @@ void dslashTest() {
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED"); printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec); //if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec); //else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
} }
end(); end();
} }
......
...@@ -8,6 +8,20 @@ ...@@ -8,6 +8,20 @@
#include <spinor_quda.h> #include <spinor_quda.h>
#include <gauge_quda.h> #include <gauge_quda.h>
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp, DagType dag_type) {
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
kappa *= cudaGaugePrecise.anisotropy;
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCCuda(out, gauge, in, kappa, tmp, invert_param->matpc_type, dag_type);
} else {
cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp,
invert_param->matpc_type, dag_type);
}
}
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type) QudaInvertParam *invert_param, DagType dag_type)
{ {
...@@ -37,45 +51,44 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp, ...@@ -37,45 +51,44 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
copyCuda(b, src); copyCuda(b, src);
copyCuda(r_sloppy, src); copyCuda(r_sloppy, src);
/*MatPCDagCuda(y, cudaGaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab
zeroCuda(y); zeroCuda(y);
double b2 = normCuda(b); double b2 = normCuda(b);
double r2 = b2; double r2 = b2;
double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver
double delta = invert_param->reliable_delta;
int k = 0;
int xUpdate = 0, rUpdate = 0;
cuDoubleComplex rho = make_cuDoubleComplex(1.0, 0.0); cuDoubleComplex rho = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex rho0 = rho; cuDoubleComplex rho0 = rho;
cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex omega = make_cuDoubleComplex(1.0, 0.0); cuDoubleComplex omega = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex beta; cuDoubleComplex beta;
cuDoubleComplex rv; cuDoubleComplex rv;
cuDoubleComplex rho_rho0; cuDoubleComplex rho_rho0;
cuDoubleComplex alpha_omega; cuDoubleComplex alpha_omega;
cuDoubleComplex beta_omega; cuDoubleComplex beta_omega;
cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0); cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);
double3 rho_r2; double3 rho_r2;
double3 omega_t2; double3 omega_t2;
double rNorm = sqrt(r2); double rNorm = sqrt(r2);
double r0Norm = rNorm; double r0Norm = rNorm;
double maxrx = rNorm; double maxrx = rNorm;
double maxrr = rNorm; double maxrr = rNorm;
double delta = invert_param->reliable_delta;
int k=0; if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
int xUpdate = 0, rUpdate = 0;
blas_quda_flops = 0;
dslash_quda_flops = 0;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart(); stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) { if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(src_sloppy, r_sloppy); // BiCRstab rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(src_sloppy, r_sloppy); // BiCRstab
copyCuda(p, r_sloppy); copyCuda(p, r_sloppy);
...@@ -83,119 +96,90 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp, ...@@ -83,119 +96,90 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
alpha_omega = cuCdiv(alpha, omega); alpha_omega = cuCdiv(alpha, omega);
rho_rho0 = cuCdiv(rho, rho0); rho_rho0 = cuCdiv(rho, rho0);
beta = cuCmul(rho_rho0, alpha_omega); beta = cuCmul(rho_rho0, alpha_omega);
// p = r - beta*omega*v + beta*(p) // p = r - beta*omega*v + beta*(p)
beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0; beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0;
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8 cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
} }
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp_sloppy, dag_type);
MatPCCuda(v, cudaGaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
} else {
cloverMatPCCuda(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param->kappa, tmp_sloppy,
invert_param->matpc_type, dag_type);
}
// rv = (r0,v) // rv = (r0,v)
rv = cDotProductCuda(src_sloppy, v); rv = cDotProductCuda(src_sloppy, v);
alpha = cuCdiv(rho, rv); alpha = cuCdiv(rho, rv);
// r -= alpha*v // r -= alpha*v
alpha.x *= -1.0; alpha.y *= -1.0; alpha.x *= -1.0; alpha.y *= -1.0;
caxpyCuda(alpha, v, r_sloppy); // 4 caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0; alpha.x *= -1.0; alpha.y *= -1.0;
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { MatVec(t, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp_sloppy, dag_type);
MatPCCuda(t, cudaGaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
} else {
cloverMatPCCuda(t, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param->kappa, tmp_sloppy,
invert_param->matpc_type,dag_type);
}
// omega = (t, r) / (t, t) // omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6 omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z; omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy); rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z; rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates // reliable updates
rNorm = sqrt(r2); rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm; if (rNorm > maxrx) maxrx = rNorm;
if (rNorm > maxrr) maxrr = rNorm; if (rNorm > maxrr) maxrr = rNorm;
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0; int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0; int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) { if (updateR) {
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param, tmp, dag_type);
MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
} else {
cloverMatPCCuda(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param->kappa, tmp,
invert_param->matpc_type, dag_type);
}
r2 = xmyNormCuda(b, r); r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r); if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
rNorm = sqrt(r2); rNorm = sqrt(r2);
maxrr = rNorm; maxrr = rNorm;
rUpdate++; rUpdate++;
if (updateX) { if (updateX) {
xpyCuda(x, y); xpyCuda(x, y);
zeroCuda(x_sloppy); zeroCuda(x_sloppy);
copyCuda(b, r); copyCuda(b, r);
r0Norm = rNorm; r0Norm = rNorm;
maxrx = rNorm; maxrx = rNorm;
xUpdate++; xUpdate++;
} }
} }
k++; k++;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
} }
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x); xpyCuda(y, x);
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
invert_param->secs += stopwatchReadSeconds(); if (invert_param->verbosity >= QUDA_VERBOSE)
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate); printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = (1.0e-9*x.volume)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize); invert_param->secs += stopwatchReadSeconds();
gflops += 1.0e-9*x.volume*rUpdate*((2*1320+48) + 3*spinorSiteSize);
gflops += 1.0e-9*x.volume*xUpdate*spinorSiteSize; float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
if (invert_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { // printf("%f gflops\n", gflops / stopwatchReadSeconds());
gflops += (1.0e-9*x.volume)*4*504*k;
gflops += (1.0e-9*x.volume)*rUpdate*2*504;
}
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
invert_param->gflops += gflops; invert_param->gflops += gflops;
invert_param->iter += k; invert_param->iter += k;
#if 0 #if 0
// Calculate the true residual // Calculate the true residual
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param, tmp, dag_type);
MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
} else {
cloverMatPCCuda(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param->kappa, tmp,
invert_param->matpc_type, dag_type);
}
double true_res = xmyNormCuda(src, r); double true_res = xmyNormCuda(src, r);
copyCuda(b, src);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
#endif #endif
if (invert_param->cuda_prec_sloppy != x.precision) { if (invert_param->cuda_prec_sloppy != x.precision) {
......
...@@ -39,6 +39,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv ...@@ -39,6 +39,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
zeroCuda(x_sloppy); zeroCuda(x_sloppy);
zeroCuda(y); zeroCuda(y);
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= cudaGaugePrecise.anisotropy;
double b2 = 0.0; double b2 = 0.0;
b2 = normCuda(b); b2 = normCuda(b);
...@@ -60,13 +63,16 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv ...@@ -60,13 +63,16 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
if (invert_param->verbosity >= QUDA_VERBOSE) if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2); printf("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
stopwatchStart(); stopwatchStart();
while (r2 > stop && k<perf->maxiter) { while (r2 > stop && k<perf->maxiter) {
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(Ap, cudaGaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type); MatPCDagMatPCCuda(Ap, cudaGaugeSloppy, p, kappa, tmp_sloppy, perf->matpc_type);
} else { } else {
cloverMatPCDagMatPCCuda(Ap, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, perf->kappa, cloverMatPCDagMatPCCuda(Ap, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, kappa,
tmp_sloppy, perf->matpc_type); tmp_sloppy, perf->matpc_type);
} }
...@@ -92,9 +98,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv ...@@ -92,9 +98,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type); MatPCDagMatPCCuda(r, cudaGaugePrecise, x, kappa, tmp, invert_param->matpc_type);
} else { } else {
cloverMatPCDagMatPCCuda(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, invert_param->kappa, cloverMatPCDagMatPCCuda(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, kappa,
tmp, invert_param->matpc_type); tmp, invert_param->matpc_type);
} }
...@@ -128,6 +134,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv ...@@ -128,6 +134,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
xpyCuda(y, x); xpyCuda(y, x);
perf->secs = stopwatchReadSeconds(); perf->secs = stopwatchReadSeconds();
if (k==invert_param->maxiter) if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter); printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
...@@ -135,18 +142,20 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv ...@@ -135,18 +142,20 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInv
if (invert_param->verbosity >= QUDA_SUMMARIZE) if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate); printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize); float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
if (invert_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) gflops += k*(1.0e-9*x.volume)*4*504; // printf("%f gflops\n", gflops / stopwatchReadSeconds());
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
perf->gflops = gflops; perf->gflops = gflops;
perf->iter = k; perf->iter = k;
blas_quda_flops = 0;
dslash_quda_flops = 0;
#if 0 #if 0
// Calculate the true residual // Calculate the true residual
if (invert_param->dslash_type == QUDA_WILSON_DSLASH) { if (invert_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(Ap, cudaGaugePrecise, x, perf->kappa, tmp, perf->matpc_type); MatPCDagMatPCCuda(Ap, cudaGaugePrecise, x, kappa, tmp, perf->matpc_type);
} else { } else {
cloverMatPCDagMatPCCuda(Ap, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, perf->kappa, cloverMatPCDagMatPCCuda(Ap, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, kappa,
tmp, perf->matpc_type); tmp, perf->matpc_type);
} }
copyCuda(r, b); copyCuda(r, b);
......
...@@ -227,6 +227,12 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, ...@@ -227,6 +227,12 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
parity = (parity+1)%2;
axCuda(gauge_param->anisotropy, in);
}
if (inv_param->dslash_type == QUDA_WILSON_DSLASH) { if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
dslashCuda(out, cudaGaugePrecise, in, parity, dagger); dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
...@@ -248,12 +254,17 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) ...@@ -248,12 +254,17 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
double kappa = inv_param->kappa;
if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
kappa *= cudaGaugePrecise.anisotropy;
if (inv_param->dslash_type == QUDA_WILSON_DSLASH) { if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger); MatPCCuda(out, cudaGaugePrecise, in, kappa, tmp, inv_param->matpc_type, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, inv_param->kappa, cloverMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, kappa,
tmp, inv_param->matpc_type, dagger); tmp, inv_param->matpc_type, dagger);
} else { } else {
printf("QUDA error: unsupported dslash_type\n"); printf("QUDA error: unsupported dslash_type\n");
...@@ -275,10 +286,15 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ...@@ -275,10 +286,15 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);