
Commit 5bf45c2a authored by rbabich

finished quda clover (untested)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@463 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 458d290a
......@@ -13,8 +13,6 @@ these kernels. Mixed-precision implementations of both CG and
BiCGstab are provided, with support for double, single, and half
(16-bit fixed-point) precision.
NOTE: In this pre-release, only the BiCGstab inverter supports clover.
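For example, the solver precisions are selected through QudaInvertParam; a minimal sketch (field names as used elsewhere in this commit, values chosen only for illustration):

    QudaInvertParam inv_param;
    inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;        // precision of the host fields
    inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;       // "precise" precision on the GPU
    inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;  // precision of the inner, "sloppy" iterations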
Software compatibility:
......@@ -53,7 +51,8 @@ Using the library:
Include the header file "invert_quda.h" in your application, link
against libquda.a, and study invert_test.c for an example of the
interface.
interface. The various inverter options are enumerated in
enum_quda.h.
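For orientation, a minimal call sequence pieced together from the test code touched by this commit (a sketch only; field setup and error handling are omitted, and hostGauge, hostClover, hostCloverInv, spinorEven, and spinorOdd are assumed to be allocated and filled by the caller):

    #include "invert_quda.h"

    initQuda(0);                                            // initialize device 0
    loadGaugeQuda(hostGauge, &gaugeParam);                  // copy the gauge field to the GPU
    loadCloverQuda(hostClover, hostCloverInv, &inv_param);  // clover term and/or its inverse
    MatPCQuda(spinorOdd, spinorEven, &inv_param, 0);        // apply the preconditioned operator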
Known issues:
......
......@@ -63,20 +63,24 @@ void freeCloverField(FullClover *clover)
template <typename Float>
static inline void packCloverMatrix(float4* a, Float *b, int Vh)
{
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<18; i++) {
a[i*Vh].x = b[4*i+0];
a[i*Vh].y = b[4*i+1];
a[i*Vh].z = b[4*i+2];
a[i*Vh].w = b[4*i+3];
a[i*Vh].x = half * b[4*i+0];
a[i*Vh].y = half * b[4*i+1];
a[i*Vh].z = half * b[4*i+2];
a[i*Vh].w = half * b[4*i+3];
}
}
template <typename Float>
static inline void packCloverMatrix(double2* a, Float *b, int Vh)
{
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<36; i++) {
a[i*Vh].x = b[2*i+0];
a[i*Vh].y = b[2*i+1];
a[i*Vh].x = half * b[2*i+0];
a[i*Vh].y = half * b[2*i+1];
}
}
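/* For reference, the layout implied above: each site carries 72 reals (two chiral
   blocks of 36 reals each, as the half-precision packer below makes explicit), stored
   as 18 float4 or 36 double2 values strided by Vh, with the basis-change factor of 1/2
   folded in at pack time.  A hypothetical unpacking helper (not part of this commit)
   reversing the float4 packing would look like: */
template <typename Float>
static inline void unpackCloverMatrix(Float *b, float4 *a, int Vh)
{
  const Float two = 2.0; // undo the factor of 1/2 applied in packCloverMatrix()
  for (int i=0; i<18; i++) {
    b[4*i+0] = two * a[i*Vh].x;
    b[4*i+1] = two * a[i*Vh].y;
    b[4*i+2] = two * a[i*Vh].z;
    b[4*i+3] = two * a[i*Vh].w;
  }
}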
......@@ -113,6 +117,7 @@ static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
template<typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
Float max, a, c;
// treat the two chiral blocks separately
......@@ -128,7 +133,7 @@ static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover,
res[i*Vh].z = (short) (c * clover[4*i+2]);
res[i*Vh].w = (short) (c * clover[4*i+3]);
}
norm[chi*Vh] = 1/c;
norm[chi*Vh] = half/c;
res += 9;
clover += 36;
}
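/* The stored shorts are scaled per chiral block: since res = (short)(c * clover) and
   norm = half/c, reconstructing norm * res recovers half * clover, i.e. the same value
   (with the basis-change factor of 1/2 included) that the single- and double-precision
   packers store directly.  A hypothetical read-back, assuming this layout: */
template <typename Float>
static inline Float unpackCloverHalf(short s, float siteNorm)
{
  // siteNorm is norm[chi*Vh] for the site's chiral block
  return (Float)siteNorm * (Float)s;  // approximately 0.5 * (original clover entry)
}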
......@@ -310,3 +315,17 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
exit(-1);
}
}
/*
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision)
{
if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
// X should contain the dimensions of the even/odd sublattice
*cudaClover = allocateCloverField(X, precision);
loadCloverField(*cudaClover, cpuClover, precision, invert_param->clover_order);
}
*/
......@@ -724,9 +724,12 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, dagger);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
} else if (matpc_type == QUDA_MATPC_ODD_ODD) {
dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
}
}
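/* Sketch of what the two calls in the QUDA_MATPC_EVEN_EVEN branch compute, assuming
   dslashXpayCuda(out, gauge, psi, oddBit, dagger, x, a) returns out = x + a*D(psi),
   which is what the definition kappa2 = -kappa*kappa suggests:
     tmp = D_oe in                      // dslashCuda(tmp, gauge, in, 1, dagger)
     out = in - kappa^2 * D_eo tmp      // dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2)
   i.e. out = (1 - kappa^2 D_eo D_oe) in, the form quoted in enum_quda.h. */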
......@@ -1798,18 +1801,32 @@ void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
}
// Apply the even-odd preconditioned clover-improved Dirac operator
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in,
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger)
{
double kappa2 = -kappa*kappa;
if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC))
&& (clover.even.clover == NULL)) {
printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n");
}
if (!dagger) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2);
} else {
} else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
}
} else { // very inefficient (FIXME)
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -1817,20 +1834,29 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, Pa
cloverDslashCuda(out, gauge, cloverInv, tmp, 1, dagger);
copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
} else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverCuda(tmp, gauge, cloverInv, in, 1);
cloverDslashCuda(out, gauge, cloverInv, tmp, 0, dagger);
copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
}
}
}
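/* In the same spirit, a sketch of the clover-improved QUDA_MATPC_EVEN_EVEN branch above,
   assuming cloverDslashCuda applies A^-1 D (clover inverse times hopping term) to the
   stated parity:
     tmp = A_oo^-1 D_oe in
     out = in - kappa^2 * A_ee^-1 D_eo tmp
   i.e. out = (1 - kappa^2 A_ee^-1 D_eo A_oo^-1 D_oe) in, the "symmetric" form described
   in enum_quda.h. */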
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in,
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type)
{
cloverMatPCCuda(out, gauge, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, cloverInv, out, kappa, tmp, matpc_type, 1);
cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, clover, cloverInv, out, kappa, tmp, matpc_type, 1);
}
// Apply the full operator (FIXME: create kernel to eliminate tmp)
......
......@@ -16,9 +16,12 @@ extern "C" {
extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy;
extern FullClover cudaClover;
extern FullClover cudaCloverPrecise;
extern FullClover cudaCloverSloppy;
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param;
......@@ -90,12 +93,12 @@ extern "C" {
int oddBit, int daggerBit, ParitySpinor x,
double a);
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge,
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
FullClover cloverInv, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type, int dagger);
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp,
FullClover clover, FullClover cloverInv,
ParitySpinor in, double kappa, ParitySpinor tmp,
MatPCType matpc_type);
void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
FullSpinor in, double kappa, ParitySpinor tmp,
......@@ -112,13 +115,11 @@ extern "C" {
ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
FullGauge gaugeSloppy, ParitySpinor tmp,
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
FullGauge gaugeSloppy, ParitySpinor tmp,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type);
#ifdef __cplusplus
......
......@@ -10,17 +10,18 @@
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover)
int dslash_type = 0;
int clover_yes = 0;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
FullGauge gauge;
FullClover clover, cloverInv;
FullSpinor cudaSpinor;
FullSpinor cudaSpinorOut;
ParitySpinor tmp;
void *hostGauge[4];
void *hostGauge[4], *hostClover, *hostCloverInv;
void *spinor, *spinorEven, *spinorOdd;
void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
......@@ -36,35 +37,42 @@ void init() {
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24;
gaugeParam.X[3] = 32;
setDims(gaugeParam.X);
gaugeParam.blockDim = 64;
gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.blockDim = 64;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.kappa = kappa;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
if (dslash_type) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
inv_param.clover_cpu_prec = QUDA_SINGLE_PRECISION;
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
inv_param.verbosity = QUDA_VERBOSE;
gauge_param = &gaugeParam;
invert_param = &inv_param;
......@@ -75,6 +83,17 @@ void init() {
// construct input fields
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize);
if (clover_yes) {
size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
if (test_type == 2) {
hostClover = malloc(V*cloverSiteSize*cSize);
hostCloverInv = hostClover; // fake it
} else {
hostClover = NULL;
hostCloverInv = malloc(V*cloverSiteSize*cSize);
}
}
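/* Rough size check (a sketch; assumes X[0] = 24, which is set just above this excerpt,
   and cloverSiteSize = 72 reals per site, consistent with the packed layout used by the
   clover packing code in this commit):
     V = 24 * 24 * 24 * 32 = 442368 sites
     double-precision host clover: 442368 * 72 * sizeof(double) = 254803968 bytes,
     about 243 MiB. */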
spinor = malloc(V*spinorSiteSize*sSize);
spinorRef = malloc(V*spinorSiteSize*sSize);
spinorGPU = malloc(V*spinorSiteSize*sSize);
......@@ -91,20 +110,36 @@ void init() {
spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize);
}
printf("Randomizing fields...");
printf("Randomizing fields... ");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
if (clover_yes) {
double norm = 1.0; // random components range between -norm and norm
double diag = 1.0; // constant added to the diagonal
if (test_type == 2) {
construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
} else {
construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
}
}
printf("done.\n"); fflush(stdout);
int dev = 0;
initQuda(dev);
loadGaugeQuda(hostGauge, &gaugeParam);
gauge = cudaGaugePrecise;
printf("Sending fields to GPU..."); fflush(stdout);
if (clover_yes) {
loadCloverQuda(hostClover, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
printf("Sending fields to GPU... "); fflush(stdout);
if (!TRANSFER) {
......@@ -129,6 +164,10 @@ void init() {
void end() {
// release memory
for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
if (clover_yes) {
if (test_type == 2) free(hostClover);
else free(hostCloverInv);
}
free(spinorGPU);
free(spinor);
free(spinorRef);
......@@ -150,16 +189,32 @@ double dslashCUDA() {
for (int i = 0; i < LOOPS; i++) {
switch (test_type) {
case 0:
if (TRANSFER) dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
if (TRANSFER) {
dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
} else if (!clover_yes) {
dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
} else {
cloverDslashCuda(cudaSpinor.odd, gauge, cloverInv, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
}
break;
case 1:
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
if (TRANSFER) {
MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
} else {
cloverMatPCCuda(cudaSpinor.odd, gauge, clover, cloverInv, cudaSpinor.even, kappa, tmp,
QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
}
break;
case 2:
if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
if (TRANSFER) {
MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
} else {
cloverMatCuda(cudaSpinorOut, gauge, clover, cudaSpinor, kappa, tmp, DAGGER_BIT);
}
}
}
......@@ -230,7 +285,7 @@ void dslashTest() {
int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
if (dslash_type) {
if (clover_yes) {
flops += test_type ? 504*2 : 504;
floats += test_type ? 72*2 : 72;
}
......
......@@ -39,16 +39,26 @@ extern "C" {
} QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
//
// For the clover-improved Wilson Dirac operator, QUDA_MATPC_EVEN_EVEN
// defaults to the "symmetric" form, (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe),
// and likewise for QUDA_MATPC_ODD_ODD.
//
// For the "asymmetric" form, (A_ee - k^2 D_eo A_oo^-1 D_oe), select
// QUDA_MATPC_EVEN_EVEN_ASYMMETRIC.
//
typedef enum QudaMatPCType_s {
QUDA_MATPC_EVEN_EVEN,
QUDA_MATPC_ODD_ODD
QUDA_MATPC_ODD_ODD,
QUDA_MATPC_EVEN_EVEN_ASYMMETRIC,
QUDA_MATPC_ODD_ODD_ASYMMETRIC
} QudaMatPCType;
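/* A one-line derivation connecting the two forms (not part of the header, but it follows
   directly from the expressions quoted above): multiplying the symmetric operator by
   A_ee gives the asymmetric one,
     A_ee * (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe)  =  A_ee - k^2 D_eo A_oo^-1 D_oe,
   so solving the asymmetric system amounts to solving the symmetric one with the source
   rescaled by A_ee^-1. */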
// The different solutions supported
typedef enum QudaSolutionType_s {
QUDA_MAT_SOLUTION,
QUDA_MATPC_SOLUTION,
QUDA_MATPCDAG_SOLUTION,
QUDA_MATPCDAG_SOLUTION, // not implemented
QUDA_MATPCDAG_MATPC_SOLUTION,
} QudaSolutionType;
......
......@@ -8,8 +8,7 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise,
FullGauge gaugeSloppy, ParitySpinor tmp,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
ParitySpinor r = allocateParitySpinor(x.X, x.precision);
......@@ -38,7 +37,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
copyCuda(b, src);
copyCuda(r_sloppy, src);
/*MatPCDagCuda(y, gaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
/*MatPCDagCuda(y, cudaGaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab
zeroCuda(y);
......@@ -90,7 +89,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
}
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
MatPCCuda(v, cudaGaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(src_sloppy, v);
......@@ -102,7 +101,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
MatPCCuda(t, cudaGaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
......@@ -122,7 +121,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
if (updateR) {
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
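/* Sketch of the intent of this reliable update (assuming xmyNormCuda(b, r) overwrites r
   with b - r and returns its squared norm):
     r  = b - MatPC(x)    computed with the full-precision gauge field
     r2 = |r|^2
     r_sloppy = r         re-seed the sloppy-precision iteration with the true residual
   which flushes out rounding error accumulated in the low-precision inner iterations. */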
......@@ -169,7 +168,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
#if 0
// Calculate the true residual
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
double true_res = xmyNormCuda(src, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
......
......@@ -7,8 +7,7 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *perf)
void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInvertParam *perf)
{
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
......@@ -63,7 +62,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
MatPCDagMatPCCuda(Ap, cudaGaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
pAp = reDotProductCuda(p, Ap);
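/* Sketch of this step (standard CG on the normal equations; not additional code from the
   commit): Ap = MatPC^dag MatPC p is formed in the sloppy precision, and pAp = (p, Ap)
   feeds the usual step length alpha = r2 / pAp.  The normal operator is Hermitian and
   positive definite, which is what CG requires. */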
......@@ -86,7 +85,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatPCDagMatPCCuda(r, gaugePrecise, x, invert_param->kappa,
MatPCDagMatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa,
tmp, invert_param->matpc_type);
r2 = xmyNormCuda(b, r);
......@@ -133,7 +132,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
#if 0
// Calculate the true residual
MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type);
MatPCDagMatPCCuda(Ap, cudaGaugePrecise, x, perf->kappa, tmp, perf->matpc_type);
copyCuda(r, b);
mxpyCuda(Ap, r);
double true_res = normCuda(r);
......
......@@ -14,9 +14,12 @@
FullGauge cudaGaugePrecise; // precise gauge field
FullGauge cudaGaugeSloppy; // sloppy gauge field
FullClover cudaCloverPrecise;
FullClover cudaCloverPrecise; // clover term
FullClover cudaCloverSloppy;
FullClover cudaCloverInvPrecise; // inverted clover term
FullClover cudaCloverInvSloppy;
void printGaugeParam(QudaGaugeParam *param) {
printf("Gauge Params:\n");
......@@ -100,6 +103,17 @@ void initQuda(int dev)
cudaGaugeSloppy.even = NULL;
cudaGaugeSloppy.odd = NULL;
cudaCloverPrecise.even.clover = NULL;
cudaCloverPrecise.odd.clover = NULL;
cudaCloverSloppy.even.clover = NULL;
cudaCloverSloppy.odd.clover = NULL;
cudaCloverInvPrecise.even.clover = NULL;
cudaCloverInvPrecise.odd.clover = NULL;
cudaCloverInvSloppy.even.clover = NULL;
cudaCloverInvSloppy.odd.clover = NULL;
}
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
......@@ -120,22 +134,86 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
}
// for now, only single-precision clover is supported
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{
if (!cudaGaugePrecise.even) {
printf("QUDA error: loadGaugeQuda() must precede call to loadCloverQuda()\n");
exit(-1);
}
if (!h_clover && !h_clovinv) {
printf("QUDA error: loadCloverQuda() called with neither clover term nor inverse\n");
exit(-1);
}
if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on CPU\n");
exit(-1);
}
int X[4];
for (int i=0; i<4; i++) {
X[i] = gauge_param->X[i];
}
X[0] /= 2; // dimensions of the even-odd sublattice
inv_param->cloverGiB = 0;
if (h_clover) {
cudaCloverPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
loadCloverField(cudaCloverPrecise, h_clover, inv_param->clover_cuda_prec, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverPrecise.even.bytes / (1<<30);
if (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cudaCloverSloppy = allocateCloverField(X, inv_param->clover_cuda_prec_sloppy);
loadCloverField(cudaCloverSloppy, h_clover, inv_param->clover_cuda_prec_sloppy, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverSloppy.even.bytes / (1<<30);
} else {
cudaCloverSloppy = cudaCloverPrecise;
}
} // sloppy precision clover term not needed otherwise
}
cudaCloverInvPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
if (!h_clovinv) {
printf("QUDA error: clover term inverse not implemented yet\n");
exit(-1);
} else {
loadCloverField(cudaCloverInvPrecise, h_clovinv, inv_param->clover_cuda_prec, inv_param->clover_order);
}
inv_param->cloverGiB += 2.0*cudaCloverInvPrecise.even.bytes / (1<<30);
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cu