Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 5bf45c2a authored by rbabich
Browse files

finished quda clover (untested)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@463 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 458d290a
...@@ -13,8 +13,6 @@ these kernels. Mixed-precision implementations of both CG and ...@@ -13,8 +13,6 @@ these kernels. Mixed-precision implementations of both CG and
BiCGstab are provided, with support for double, single, and half BiCGstab are provided, with support for double, single, and half
(16-bit fixed-point) precision. (16-bit fixed-point) precision.
NOTE: In this pre-release, only the BiCGstab inverter supports clover.
Software compatibility: Software compatibility:
...@@ -53,7 +51,8 @@ Using the library: ...@@ -53,7 +51,8 @@ Using the library:
Include the header file "invert_quda.h" in your application, link Include the header file "invert_quda.h" in your application, link
against libquda.a, and study invert_test.c for an example of the against libquda.a, and study invert_test.c for an example of the
interface. interface. The various inverter options are enumerated in
enum_quda.h.
Known issues: Known issues:
......
...@@ -63,20 +63,24 @@ void freeCloverField(FullClover *clover) ...@@ -63,20 +63,24 @@ void freeCloverField(FullClover *clover)
template <typename Float> template <typename Float>
static inline void packCloverMatrix(float4* a, Float *b, int Vh) static inline void packCloverMatrix(float4* a, Float *b, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<18; i++) { for (int i=0; i<18; i++) {
a[i*Vh].x = b[4*i+0]; a[i*Vh].x = half * b[4*i+0];
a[i*Vh].y = b[4*i+1]; a[i*Vh].y = half * b[4*i+1];
a[i*Vh].z = b[4*i+2]; a[i*Vh].z = half * b[4*i+2];
a[i*Vh].w = b[4*i+3]; a[i*Vh].w = half * b[4*i+3];
} }
} }
template <typename Float> template <typename Float>
static inline void packCloverMatrix(double2* a, Float *b, int Vh) static inline void packCloverMatrix(double2* a, Float *b, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<36; i++) { for (int i=0; i<36; i++) {
a[i*Vh].x = b[2*i+0]; a[i*Vh].x = half * b[2*i+0];
a[i*Vh].y = b[2*i+1]; a[i*Vh].y = half * b[2*i+1];
} }
} }
...@@ -113,6 +117,7 @@ static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X) ...@@ -113,6 +117,7 @@ static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
template<typename Float> template<typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh) static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
Float max, a, c; Float max, a, c;
// treat the two chiral blocks separately // treat the two chiral blocks separately
...@@ -128,7 +133,7 @@ static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, ...@@ -128,7 +133,7 @@ static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover,
res[i*Vh].z = (short) (c * clover[4*i+2]); res[i*Vh].z = (short) (c * clover[4*i+2]);
res[i*Vh].w = (short) (c * clover[4*i+3]); res[i*Vh].w = (short) (c * clover[4*i+3]);
} }
norm[chi*Vh] = 1/c; norm[chi*Vh] = half/c;
res += 9; res += 9;
clover += 36; clover += 36;
} }
...@@ -310,3 +315,17 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie ...@@ -310,3 +315,17 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
exit(-1); exit(-1);
} }
} }
/*
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision)
{
if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
// X should contain the dimensions of the even/odd sublattice
*cudaClover = allocateCloverField(X, precision);
loadCloverField(*cudaClover, cpuClover, precision, invert_param->clover_order);
}
*/
...@@ -724,9 +724,12 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ...@@ -724,9 +724,12 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, dagger); dslashCuda(tmp, gauge, in, 1, dagger);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
dslashCuda(tmp, gauge, in, 0, dagger); dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
} }
} }
...@@ -1798,18 +1801,32 @@ void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI ...@@ -1798,18 +1801,32 @@ void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
} }
// Apply the even-odd preconditioned clover-improved Dirac operator // Apply the even-odd preconditioned clover-improved Dirac operator
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger) double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger)
{ {
double kappa2 = -kappa*kappa; double kappa2 = -kappa*kappa;
if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC))
&& (clover.even.clover == NULL)) {
printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n");
}
if (!dagger) { if (!dagger) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger); cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger); cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
} }
} else { // very inefficient (FIXME) } else { // very inefficient (FIXME)
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
...@@ -1817,20 +1834,29 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, Pa ...@@ -1817,20 +1834,29 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, Pa
cloverDslashCuda(out, gauge, cloverInv, tmp, 1, dagger); cloverDslashCuda(out, gauge, cloverInv, tmp, 1, dagger);
copyCuda(tmp, out); copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverCuda(tmp, gauge, cloverInv, in, 1); cloverCuda(tmp, gauge, cloverInv, in, 1);
cloverDslashCuda(out, gauge, cloverInv, tmp, 0, dagger); cloverDslashCuda(out, gauge, cloverInv, tmp, 0, dagger);
copyCuda(tmp, out); copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
} }
} }
} }
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type) double kappa, ParitySpinor tmp, MatPCType matpc_type)
{ {
cloverMatPCCuda(out, gauge, cloverInv, in, kappa, tmp, matpc_type, 0); cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, cloverInv, out, kappa, tmp, matpc_type, 1); cloverMatPCCuda(out, gauge, clover, cloverInv, out, kappa, tmp, matpc_type, 1);
} }
// Apply the full operator (FIXME: create kernel to eliminate tmp) // Apply the full operator (FIXME: create kernel to eliminate tmp)
......
...@@ -16,9 +16,12 @@ extern "C" { ...@@ -16,9 +16,12 @@ extern "C" {
extern FullGauge cudaGaugePrecise; extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy; extern FullGauge cudaGaugeSloppy;
extern FullClover cudaClover; extern FullClover cudaCloverPrecise;
extern FullClover cudaCloverSloppy; extern FullClover cudaCloverSloppy;
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
extern QudaGaugeParam *gauge_param; extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param; extern QudaInvertParam *invert_param;
...@@ -90,12 +93,12 @@ extern "C" { ...@@ -90,12 +93,12 @@ extern "C" {
int oddBit, int daggerBit, ParitySpinor x, int oddBit, int daggerBit, ParitySpinor x,
double a); double a);
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
FullClover cloverInv, ParitySpinor in, double kappa, FullClover cloverInv, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type, int dagger); ParitySpinor tmp, MatPCType matpc_type, int dagger);
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in, FullClover clover, FullClover cloverInv,
double kappa, ParitySpinor tmp, ParitySpinor in, double kappa, ParitySpinor tmp,
MatPCType matpc_type); MatPCType matpc_type);
void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover, void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
FullSpinor in, double kappa, ParitySpinor tmp, FullSpinor in, double kappa, ParitySpinor tmp,
...@@ -112,13 +115,11 @@ extern "C" { ...@@ -112,13 +115,11 @@ extern "C" {
ParitySpinor spinor, int oddBit); ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp // -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param); QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp // -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type); QudaInvertParam *param, DagType dag_type);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -10,17 +10,18 @@ ...@@ -10,17 +10,18 @@
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1; int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover) // clover-improved? (0 = plain Wilson, 1 = clover)
int dslash_type = 0; int clover_yes = 0;
QudaGaugeParam gaugeParam; QudaGaugeParam gaugeParam;
QudaInvertParam inv_param; QudaInvertParam inv_param;
FullGauge gauge; FullGauge gauge;
FullClover clover, cloverInv;
FullSpinor cudaSpinor; FullSpinor cudaSpinor;
FullSpinor cudaSpinorOut; FullSpinor cudaSpinorOut;
ParitySpinor tmp; ParitySpinor tmp;
void *hostGauge[4]; void *hostGauge[4], *hostClover, *hostCloverInv;
void *spinor, *spinorEven, *spinorOdd; void *spinor, *spinorEven, *spinorOdd;
void *spinorRef, *spinorRefEven, *spinorRefOdd; void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd; void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
...@@ -36,35 +37,42 @@ void init() { ...@@ -36,35 +37,42 @@ void init() {
gaugeParam.X[1] = 24; gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24; gaugeParam.X[2] = 24;
gaugeParam.X[3] = 32; gaugeParam.X[3] = 32;
setDims(gaugeParam.X); setDims(gaugeParam.X);
gaugeParam.blockDim = 64; gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION; gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.blockDim = 64;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.kappa = kappa;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION; inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER; else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
if (dslash_type) { if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION; inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
} else { inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
inv_param.dslash_type = QUDA_WILSON_DSLASH;
} }
inv_param.verbosity = QUDA_VERBOSE;
gauge_param = &gaugeParam; gauge_param = &gaugeParam;
invert_param = &inv_param; invert_param = &inv_param;
...@@ -75,6 +83,17 @@ void init() { ...@@ -75,6 +83,17 @@ void init() {
// construct input fields // construct input fields
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize); for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize);
if (clover_yes) {
size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
if (test_type == 2) {
hostClover = malloc(V*cloverSiteSize*cSize);
hostCloverInv = hostClover; // fake it
} else {
hostClover = NULL;
hostCloverInv = malloc(V*cloverSiteSize*cSize);
}
}
spinor = malloc(V*spinorSiteSize*sSize); spinor = malloc(V*spinorSiteSize*sSize);
spinorRef = malloc(V*spinorSiteSize*sSize); spinorRef = malloc(V*spinorSiteSize*sSize);
spinorGPU = malloc(V*spinorSiteSize*sSize); spinorGPU = malloc(V*spinorSiteSize*sSize);
...@@ -91,20 +110,36 @@ void init() { ...@@ -91,20 +110,36 @@ void init() {
spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize); spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize);
} }
printf("Randomizing fields..."); printf("Randomizing fields... ");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec); construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec); construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
if (clover_yes) {
double norm = 1.0; // random components range between -norm and norm
double diag = 1.0; // constant added to the diagonal
if (test_type == 2) {
construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
} else {
construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
}
}
printf("done.\n"); fflush(stdout); printf("done.\n"); fflush(stdout);
int dev = 0; int dev = 0;
initQuda(dev); initQuda(dev);
loadGaugeQuda(hostGauge, &gaugeParam);
loadGaugeQuda(hostGauge, &gaugeParam);
gauge = cudaGaugePrecise; gauge = cudaGaugePrecise;
printf("Sending fields to GPU..."); fflush(stdout); if (clover_yes) {
loadCloverQuda(hostClover, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
printf("Sending fields to GPU... "); fflush(stdout);
if (!TRANSFER) { if (!TRANSFER) {
...@@ -129,6 +164,10 @@ void init() { ...@@ -129,6 +164,10 @@ void init() {
void end() { void end() {
// release memory // release memory
for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
if (clover_yes) {
if (test_type == 2) free(hostClover);
else free(hostCloverInv);
}
free(spinorGPU); free(spinorGPU);
free(spinor); free(spinor);
free(spinorRef); free(spinorRef);
...@@ -150,16 +189,32 @@ double dslashCUDA() { ...@@ -150,16 +189,32 @@ double dslashCUDA() {
for (int i = 0; i < LOOPS; i++) { for (int i = 0; i < LOOPS; i++) {
switch (test_type) { switch (test_type) {
case 0: case 0:
if (TRANSFER) dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT); if (TRANSFER) {
else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT); dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
} else if (!clover_yes) {
dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
} else {
cloverDslashCuda(cudaSpinor.odd, gauge, cloverInv, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
}
break; break;
case 1: case 1:
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT); if (TRANSFER) {
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT); MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
} else {
cloverMatPCCuda(cudaSpinor.odd, gauge, clover, cloverInv, cudaSpinor.even, kappa, tmp,
QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
}
break; break;
case 2: case 2:
if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT); if (TRANSFER) {
else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT); MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
} else {
cloverMatCuda(cudaSpinorOut, gauge, clover, cudaSpinor, kappa, tmp, DAGGER_BIT);
}
} }
} }
...@@ -230,7 +285,7 @@ void dslashTest() { ...@@ -230,7 +285,7 @@ void dslashTest() {
int flops = test_type ? 1320*2 + 48 : 1320; int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24; int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
if (dslash_type) { if (clover_yes) {
flops += test_type ? 504*2 : 504; flops += test_type ? 504*2 : 504;
floats += test_type ? 72*2 : 72; floats += test_type ? 72*2 : 72;
} }
......
...@@ -39,16 +39,26 @@ extern "C" { ...@@ -39,16 +39,26 @@ extern "C" {
} QudaPrecision; } QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo) // Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
//
// For the clover-improved Wilson Dirac operator, QUDA_MATPC_EVEN_EVEN
// defaults to the "symmetric" form, (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe),
// and likewise for QUDA_MATPC_ODD_ODD.
//
// For the "asymmetric" form, (A_ee - k^2 D_eo A_oo^-1 D_oe), select
// QUDA_MATPC_EVEN_EVEN_ASYMMETRIC.
//
typedef enum QudaMatPCType_s { typedef enum QudaMatPCType_s {
QUDA_MATPC_EVEN_EVEN, QUDA_MATPC_EVEN_EVEN,
QUDA_MATPC_ODD_ODD QUDA_MATPC_ODD_ODD,
QUDA_MATPC_EVEN_EVEN_ASYMMETRIC,
QUDA_MATPC_ODD_ODD_ASYMMETRIC
} QudaMatPCType; } QudaMatPCType;
// The different solutions supported // The different solutions supported
typedef enum QudaSolutionType_s { typedef enum QudaSolutionType_s {
QUDA_MAT_SOLUTION, QUDA_MAT_SOLUTION,
QUDA_MATPC_SOLUTION, QUDA_MATPC_SOLUTION,
QUDA_MATPCDAG_SOLUTION, QUDA_MATPCDAG_SOLUTION, // not implemented
QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MATPCDAG_MATPC_SOLUTION,
} QudaSolutionType; } QudaSolutionType;
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
#include <spinor_quda.h> #include <spinor_quda.h>
#include <gauge_quda.h> #include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type) QudaInvertParam *invert_param, DagType dag_type)
{ {
ParitySpinor r = allocateParitySpinor(x.X, x.precision); ParitySpinor r = allocateParitySpinor(x.X, x.precision);
...@@ -38,7 +37,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -38,7 +37,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
copyCuda(b, src); copyCuda(b, src);
copyCuda(r_sloppy, src); copyCuda(r_sloppy, src);