Advanced Computing Platform for Theoretical Physics

Commit 5bf45c2a authored by rbabich's avatar rbabich
Browse files

finished quda clover (untested)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@463 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 458d290a
...@@ -13,8 +13,6 @@ these kernels. Mixed-precision implementations of both CG and ...@@ -13,8 +13,6 @@ these kernels. Mixed-precision implementations of both CG and
BiCGstab are provided, with support for double, single, and half BiCGstab are provided, with support for double, single, and half
(16-bit fixed-point) precision. (16-bit fixed-point) precision.
NOTE: In this pre-release, only the BiCGstab inverter supports clover.
Software compatibility: Software compatibility:
...@@ -53,7 +51,8 @@ Using the library: ...@@ -53,7 +51,8 @@ Using the library:
Include the header file "invert_quda.h" in your application, link Include the header file "invert_quda.h" in your application, link
against libquda.a, and study invert_test.c for an example of the against libquda.a, and study invert_test.c for an example of the
interface. interface. The various inverter options are enumerated in
enum_quda.h.
Known issues: Known issues:
......
...@@ -63,20 +63,24 @@ void freeCloverField(FullClover *clover) ...@@ -63,20 +63,24 @@ void freeCloverField(FullClover *clover)
template <typename Float> template <typename Float>
static inline void packCloverMatrix(float4* a, Float *b, int Vh) static inline void packCloverMatrix(float4* a, Float *b, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<18; i++) { for (int i=0; i<18; i++) {
a[i*Vh].x = b[4*i+0]; a[i*Vh].x = half * b[4*i+0];
a[i*Vh].y = b[4*i+1]; a[i*Vh].y = half * b[4*i+1];
a[i*Vh].z = b[4*i+2]; a[i*Vh].z = half * b[4*i+2];
a[i*Vh].w = b[4*i+3]; a[i*Vh].w = half * b[4*i+3];
} }
} }
template <typename Float> template <typename Float>
static inline void packCloverMatrix(double2* a, Float *b, int Vh) static inline void packCloverMatrix(double2* a, Float *b, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
for (int i=0; i<36; i++) { for (int i=0; i<36; i++) {
a[i*Vh].x = b[2*i+0]; a[i*Vh].x = half * b[2*i+0];
a[i*Vh].y = b[2*i+1]; a[i*Vh].y = half * b[2*i+1];
} }
} }
...@@ -113,6 +117,7 @@ static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X) ...@@ -113,6 +117,7 @@ static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
template<typename Float> template<typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh) static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{ {
const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
Float max, a, c; Float max, a, c;
// treat the two chiral blocks separately // treat the two chiral blocks separately
...@@ -128,7 +133,7 @@ static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, ...@@ -128,7 +133,7 @@ static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover,
res[i*Vh].z = (short) (c * clover[4*i+2]); res[i*Vh].z = (short) (c * clover[4*i+2]);
res[i*Vh].w = (short) (c * clover[4*i+3]); res[i*Vh].w = (short) (c * clover[4*i+3]);
} }
norm[chi*Vh] = 1/c; norm[chi*Vh] = half/c;
res += 9; res += 9;
clover += 36; clover += 36;
} }
...@@ -310,3 +315,17 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie ...@@ -310,3 +315,17 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
exit(-1); exit(-1);
} }
} }
/*
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision)
{
if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
// X should contain the dimensions of the even/odd sublattice
*cudaClover = allocateCloverField(X, precision);
loadCloverField(*cudaClover, cpuClover, precision, invert_param->clover_order);
}
*/
...@@ -724,9 +724,12 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ...@@ -724,9 +724,12 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, dagger); dslashCuda(tmp, gauge, in, 1, dagger);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
dslashCuda(tmp, gauge, in, 0, dagger); dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
} }
} }
...@@ -1798,18 +1801,32 @@ void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI ...@@ -1798,18 +1801,32 @@ void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
} }
// Apply the even-odd preconditioned clover-improved Dirac operator // Apply the even-odd preconditioned clover-improved Dirac operator
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger) double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger)
{ {
double kappa2 = -kappa*kappa; double kappa2 = -kappa*kappa;
if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC))
&& (clover.even.clover == NULL)) {
printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n");
}
if (!dagger) { if (!dagger) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger); cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger); cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
} }
} else { // very inefficient (FIXME) } else { // very inefficient (FIXME)
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
...@@ -1817,20 +1834,29 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, Pa ...@@ -1817,20 +1834,29 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, Pa
cloverDslashCuda(out, gauge, cloverInv, tmp, 1, dagger); cloverDslashCuda(out, gauge, cloverInv, tmp, 1, dagger);
copyCuda(tmp, out); copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else { } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
cloverCuda(tmp, gauge, cloverInv, in, 1); cloverCuda(tmp, gauge, cloverInv, in, 1);
cloverDslashCuda(out, gauge, cloverInv, tmp, 0, dagger); cloverDslashCuda(out, gauge, cloverInv, tmp, 0, dagger);
copyCuda(tmp, out); copyCuda(tmp, out);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_EVEN_EVEN_ASYMMETRIC not implemented yet\n");
exit(1);
} else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
printf("QUDA error: matpc_type QUDA_MATPC_ODD_ODD_ASYMMETRIC not implemented yet\n");
exit(-1);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
} }
} }
} }
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type) double kappa, ParitySpinor tmp, MatPCType matpc_type)
{ {
cloverMatPCCuda(out, gauge, cloverInv, in, kappa, tmp, matpc_type, 0); cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, cloverInv, out, kappa, tmp, matpc_type, 1); cloverMatPCCuda(out, gauge, clover, cloverInv, out, kappa, tmp, matpc_type, 1);
} }
// Apply the full operator (FIXME: create kernel to eliminate tmp) // Apply the full operator (FIXME: create kernel to eliminate tmp)
......
...@@ -16,9 +16,12 @@ extern "C" { ...@@ -16,9 +16,12 @@ extern "C" {
extern FullGauge cudaGaugePrecise; extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy; extern FullGauge cudaGaugeSloppy;
extern FullClover cudaClover; extern FullClover cudaCloverPrecise;
extern FullClover cudaCloverSloppy; extern FullClover cudaCloverSloppy;
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
extern QudaGaugeParam *gauge_param; extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param; extern QudaInvertParam *invert_param;
...@@ -90,12 +93,12 @@ extern "C" { ...@@ -90,12 +93,12 @@ extern "C" {
int oddBit, int daggerBit, ParitySpinor x, int oddBit, int daggerBit, ParitySpinor x,
double a); double a);
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
FullClover cloverInv, ParitySpinor in, double kappa, FullClover cloverInv, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type, int dagger); ParitySpinor tmp, MatPCType matpc_type, int dagger);
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in, FullClover clover, FullClover cloverInv,
double kappa, ParitySpinor tmp, ParitySpinor in, double kappa, ParitySpinor tmp,
MatPCType matpc_type); MatPCType matpc_type);
void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover, void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
FullSpinor in, double kappa, ParitySpinor tmp, FullSpinor in, double kappa, ParitySpinor tmp,
...@@ -112,13 +115,11 @@ extern "C" { ...@@ -112,13 +115,11 @@ extern "C" {
ParitySpinor spinor, int oddBit); ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp // -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param); QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp // -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type); QudaInvertParam *param, DagType dag_type);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -10,17 +10,18 @@ ...@@ -10,17 +10,18 @@
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat) // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1; int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover) // clover-improved? (0 = plain Wilson, 1 = clover)
int dslash_type = 0; int clover_yes = 0;
QudaGaugeParam gaugeParam; QudaGaugeParam gaugeParam;
QudaInvertParam inv_param; QudaInvertParam inv_param;
FullGauge gauge; FullGauge gauge;
FullClover clover, cloverInv;
FullSpinor cudaSpinor; FullSpinor cudaSpinor;
FullSpinor cudaSpinorOut; FullSpinor cudaSpinorOut;
ParitySpinor tmp; ParitySpinor tmp;
void *hostGauge[4]; void *hostGauge[4], *hostClover, *hostCloverInv;
void *spinor, *spinorEven, *spinorOdd; void *spinor, *spinorEven, *spinorOdd;
void *spinorRef, *spinorRefEven, *spinorRefOdd; void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd; void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
...@@ -36,35 +37,42 @@ void init() { ...@@ -36,35 +37,42 @@ void init() {
gaugeParam.X[1] = 24; gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24; gaugeParam.X[2] = 24;
gaugeParam.X[3] = 32; gaugeParam.X[3] = 32;
setDims(gaugeParam.X); setDims(gaugeParam.X);
gaugeParam.blockDim = 64; gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION; gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.blockDim = 64;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.kappa = kappa;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION; inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER; else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
if (dslash_type) { if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION; inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
} else { inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
inv_param.dslash_type = QUDA_WILSON_DSLASH;
} }
inv_param.verbosity = QUDA_VERBOSE;
gauge_param = &gaugeParam; gauge_param = &gaugeParam;
invert_param = &inv_param; invert_param = &inv_param;
...@@ -75,6 +83,17 @@ void init() { ...@@ -75,6 +83,17 @@ void init() {
// construct input fields // construct input fields
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize); for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize);
if (clover_yes) {
size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
if (test_type == 2) {
hostClover = malloc(V*cloverSiteSize*cSize);
hostCloverInv = hostClover; // fake it
} else {
hostClover = NULL;
hostCloverInv = malloc(V*cloverSiteSize*cSize);
}
}
spinor = malloc(V*spinorSiteSize*sSize); spinor = malloc(V*spinorSiteSize*sSize);
spinorRef = malloc(V*spinorSiteSize*sSize); spinorRef = malloc(V*spinorSiteSize*sSize);
spinorGPU = malloc(V*spinorSiteSize*sSize); spinorGPU = malloc(V*spinorSiteSize*sSize);
...@@ -91,20 +110,36 @@ void init() { ...@@ -91,20 +110,36 @@ void init() {
spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize); spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize);
} }
printf("Randomizing fields..."); printf("Randomizing fields... ");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec); construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec); construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
if (clover_yes) {
double norm = 1.0; // random components range between -norm and norm
double diag = 1.0; // constant added to the diagonal
if (test_type == 2) {
construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
} else {
construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
}
}
printf("done.\n"); fflush(stdout); printf("done.\n"); fflush(stdout);
int dev = 0; int dev = 0;
initQuda(dev); initQuda(dev);
loadGaugeQuda(hostGauge, &gaugeParam);
loadGaugeQuda(hostGauge, &gaugeParam);
gauge = cudaGaugePrecise; gauge = cudaGaugePrecise;
printf("Sending fields to GPU..."); fflush(stdout); if (clover_yes) {
loadCloverQuda(hostClover, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
printf("Sending fields to GPU... "); fflush(stdout);
if (!TRANSFER) { if (!TRANSFER) {
...@@ -129,6 +164,10 @@ void init() { ...@@ -129,6 +164,10 @@ void init() {
void end() { void end() {
// release memory // release memory
for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]); for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
if (clover_yes) {
if (test_type == 2) free(hostClover);
else free(hostCloverInv);
}
free(spinorGPU); free(spinorGPU);
free(spinor); free(spinor);
free(spinorRef); free(spinorRef);
...@@ -150,16 +189,32 @@ double dslashCUDA() { ...@@ -150,16 +189,32 @@ double dslashCUDA() {
for (int i = 0; i < LOOPS; i++) { for (int i = 0; i < LOOPS; i++) {
switch (test_type) { switch (test_type) {
case 0: case 0:
if (TRANSFER) dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT); if (TRANSFER) {
else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT); dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
} else if (!clover_yes) {
dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
} else {
cloverDslashCuda(cudaSpinor.odd, gauge, cloverInv, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
}
break; break;
case 1: case 1:
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT); if (TRANSFER) {
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT); MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
} else {
cloverMatPCCuda(cudaSpinor.odd, gauge, clover, cloverInv, cudaSpinor.even, kappa, tmp,
QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
}
break; break;
case 2: case 2:
if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT); if (TRANSFER) {
else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT); MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
} else if (!clover_yes) {
MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
} else {
cloverMatCuda(cudaSpinorOut, gauge, clover, cudaSpinor, kappa, tmp, DAGGER_BIT);
}
} }
} }
...@@ -230,7 +285,7 @@ void dslashTest() { ...@@ -230,7 +285,7 @@ void dslashTest() {
int flops = test_type ? 1320*2 + 48 : 1320; int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24; int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
if (dslash_type) { if (clover_yes) {
flops += test_type ? 504*2 : 504; flops += test_type ? 504*2 : 504;
floats += test_type ? 72*2 : 72; floats += test_type ? 72*2 : 72;
} }
......
...@@ -39,16 +39,26 @@ extern "C" { ...@@ -39,16 +39,26 @@ extern "C" {
} QudaPrecision; } QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo) // Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
//
// For the clover-improved Wilson Dirac operator, QUDA_MATPC_EVEN_EVEN
// defaults to the "symmetric" form, (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe),
// and likewise for QUDA_MATPC_ODD_ODD.
//
// For the "asymmetric" form, (A_ee - k^2 D_eo A_oo^-1 D_oe), select
// QUDA_MATPC_EVEN_EVEN_ASYMMETRIC.
//
typedef enum QudaMatPCType_s { typedef enum QudaMatPCType_s {
QUDA_MATPC_EVEN_EVEN, QUDA_MATPC_EVEN_EVEN,
QUDA_MATPC_ODD_ODD QUDA_MATPC_ODD_ODD,
QUDA_MATPC_EVEN_EVEN_ASYMMETRIC,
QUDA_MATPC_ODD_ODD_ASYMMETRIC
} QudaMatPCType; } QudaMatPCType;
// The different solutions supported // The different solutions supported
typedef enum QudaSolutionType_s { typedef enum QudaSolutionType_s {
QUDA_MAT_SOLUTION, QUDA_MAT_SOLUTION,
QUDA_MATPC_SOLUTION, QUDA_MATPC_SOLUTION,
QUDA_MATPCDAG_SOLUTION, QUDA_MATPCDAG_SOLUTION, // not implemented
QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MATPCDAG_MATPC_SOLUTION,
} QudaSolutionType; } QudaSolutionType;
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
#include <spinor_quda.h> #include <spinor_quda.h>
#include <gauge_quda.h> #include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, ParitySpinor tmp,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type) QudaInvertParam *invert_param, DagType dag_type)
{ {
ParitySpinor r = allocateParitySpinor(x.X, x.precision); ParitySpinor r = allocateParitySpinor(x.X, x.precision);
...@@ -38,7 +37,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -38,7 +37,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
copyCuda(b, src); copyCuda(b, src);
copyCuda(r_sloppy, src); copyCuda(r_sloppy, src);
/*MatPCDagCuda(y, gaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type); /*MatPCDagCuda(y, cudaGaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab
zeroCuda(y); zeroCuda(y);
...@@ -90,7 +89,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -90,7 +89,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8 cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
} }
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type); MatPCCuda(v, cudaGaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// rv = (r0,v) // rv = (r0,v)
rv = cDotProductCuda(src_sloppy, v); rv = cDotProductCuda(src_sloppy, v);
...@@ -102,7 +101,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -102,7 +101,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
caxpyCuda(alpha, v, r_sloppy); // 4 caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0; alpha.x *= -1.0; alpha.y *= -1.0;
MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type); MatPCCuda(t, cudaGaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// omega = (t, r) / (t, t) // omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6 omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
...@@ -122,7 +121,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -122,7 +121,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
if (updateR) { if (updateR) {
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type); MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
r2 = xmyNormCuda(b, r); r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r); if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
...@@ -169,7 +168,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -169,7 +168,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
#if 0 #if 0
// Calculate the true residual // Calculate the true residual
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type); MatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
double true_res = xmyNormCuda(src, r); double true_res = xmyNormCuda(src, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2)); printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
......
...@@ -7,8 +7,7 @@ ...@@ -7,8 +7,7 @@
#include <spinor_quda.h> #include <spinor_quda.h>
#include <gauge_quda.h> #include <gauge_quda.h>
void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, void invertCgCuda(ParitySpinor x, ParitySpinor source, ParitySpinor tmp, QudaInvertParam *perf)
FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *perf)
{ {
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy); ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy); ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
...@@ -63,7 +62,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -63,7 +62,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
printf("%d iterations, r2 = %e\n", k, r2); printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart(); stopwatchStart();
while (r2 > stop && k<perf->maxiter) { while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type); MatPCDagMatPCCuda(Ap, cudaGaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
pAp = reDotProductCuda(p, Ap); pAp = reDotProductCuda(p, Ap);
...@@ -86,7 +85,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -86,7 +85,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatPCDagMatPCCuda(r, gaugePrecise, x, invert_param->kappa, MatPCDagMatPCCuda(r, cudaGaugePrecise, x, invert_param->kappa,
tmp, invert_param->matpc_type); tmp, invert_param->matpc_type);
r2 = xmyNormCuda(b, r); r2 = xmyNormCuda(b, r);
...@@ -133,7 +132,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -133,7 +132,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
#if 0 #if 0
// Calculate the true residual // Calculate the true residual
MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type); MatPCDagMatPCCuda(Ap, cudaGaugePrecise, x, perf->kappa, tmp, perf->matpc_type);
copyCuda(r, b); copyCuda(r, b);
mxpyCuda(Ap, r); mxpyCuda(Ap, r);
double true_res = normCuda(r); double true_res = normCuda(r);
......
...@@ -14,9 +14,12 @@ ...@@ -14,9 +14,12 @@
FullGauge cudaGaugePrecise; // precise gauge field FullGauge cudaGaugePrecise; // precise gauge field
FullGauge cudaGaugeSloppy; // sloppy gauge field FullGauge cudaGaugeSloppy; // sloppy gauge field
FullClover cudaCloverPrecise; FullClover cudaCloverPrecise; // clover term
FullClover cudaCloverSloppy; FullClover cudaCloverSloppy;
FullClover cudaCloverInvPrecise; // inverted clover term
FullClover cudaCloverInvSloppy;
void printGaugeParam(QudaGaugeParam *param) { void printGaugeParam(QudaGaugeParam *param) {
printf("Gauge Params:\n"); printf("Gauge Params:\n");
...@@ -100,6 +103,17 @@ void initQuda(int dev) ...@@ -100,6 +103,17 @@ void initQuda(int dev)
cudaGaugeSloppy.even = NULL; cudaGaugeSloppy.even = NULL;
cudaGaugeSloppy.odd = NULL; cudaGaugeSloppy.odd = NULL;
cudaCloverPrecise.even.clover = NULL;
cudaCloverPrecise.odd.clover = NULL;
cudaCloverSloppy.even.clover = NULL;
cudaCloverSloppy.odd.clover = NULL;
cudaCloverInvPrecise.even.clover = NULL;
cudaCloverInvPrecise.odd.clover = NULL;
cudaCloverInvSloppy.even.clover = NULL;
cudaCloverInvSloppy.odd.clover = NULL;
} }
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
...@@ -120,22 +134,86 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) ...@@ -120,22 +134,86 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
} else { } else {
cudaGaugeSloppy = cudaGaugePrecise; cudaGaugeSloppy = cudaGaugePrecise;
} }
} }
// for now, only single-precision clover is supported void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *param)
{ {
if (!cudaGaugePrecise.even) {
printf("QUDA error: loadGaugeQuda() must precede call to loadCloverQuda()\n");
exit(-1);
}
if (!h_clover && !h_clovinv) {
printf("QUDA error: loadCloverQuda() called with neither clover term nor inverse\n");
exit(-1);
}
if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on CPU\n");
exit(-1);
}
int X[4];
for (int i=0; i<4; i++) {
X[i] = gauge_param->X[i];
}
X[0] /= 2; // dimensions of the even-odd sublattice
inv_param->cloverGiB = 0;
if (h_clover) {
cudaCloverPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
loadCloverField(cudaCloverPrecise, h_clover, inv_param->clover_cuda_prec, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverPrecise.even.bytes / (1<<30);
if (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cudaCloverSloppy = allocateCloverField(X, inv_param->clover_cuda_prec_sloppy);
loadCloverField(cudaCloverSloppy, h_clover, inv_param->clover_cuda_prec_sloppy, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30);
} else {
cudaCloverSloppy = cudaCloverPrecise;
}
} // sloppy precision clover term not needed otherwise
}
cudaCloverInvPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
if (!h_clovinv) {
printf("QUDA error: clover term inverse not implemented yet\n");
exit(-1);
} else {
loadCloverField(cudaCloverInvPrecise, h_clovinv, inv_param->clover_cuda_prec, inv_param->clover_order);
}
inv_param->cloverGiB += 2.0*cudaCloverInvPrecise.even.bytes / (1<<30);
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cudaCloverInvSloppy = allocateCloverField(X, inv_param->clover_cuda_prec_sloppy);
loadCloverField(cudaCloverInvSloppy, h_clovinv, inv_param->clover_cuda_prec_sloppy, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30);
} else {
cudaCloverInvSloppy = cudaCloverInvPrecise;
}
}
cudaCloverSloppy = cudaCloverPrecise; // discard clover term but keep the inverse
void discardCloverQuda(QudaInvertParam *inv_param)
{
inv_param->cloverGiB -= 2.0*cudaCloverPrecise.even.bytes / (1<<30);
freeCloverField(&cudaCloverPrecise);
if (cudaCloverSloppy.even.clover) {
inv_param->cloverGiB -= 2.0*cudaCloverSloppy.even.bytes / (1<<30);
freeCloverField(&cudaCloverSloppy);
}
} }
void endQuda() void endQuda(void)
{ {
freeSpinorBuffer(); freeSpinorBuffer();
freeGaugeField(&cudaGaugePrecise); freeGaugeField(&cudaGaugePrecise);
freeGaugeField(&cudaGaugeSloppy); freeGaugeField(&cudaGaugeSloppy);
if (cudaCloverPrecise.even.clover) freeCloverField(&cudaCloverPrecise);
if (cudaCloverSloppy.even.clover) freeCloverField(&cudaCloverSloppy);
if (cudaCloverInvPrecise.even.clover) freeCloverField(&cudaCloverInvPrecise);
if (cudaCloverInvSloppy.even.clover) freeCloverField(&cudaCloverInvSloppy);
} }
void checkPrecision(QudaInvertParam *param) { void checkPrecision(QudaInvertParam *param) {
...@@ -153,7 +231,14 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, ...@@ -153,7 +231,14 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashCuda(out, cudaGaugePrecise, in, parity, dagger); if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverDslashCuda(out, cudaGaugePrecise, cudaCloverInvPrecise, in, parity, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order); retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(out); freeParitySpinor(out);
...@@ -169,7 +254,15 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) ...@@ -169,7 +254,15 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger); if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, inv_param->kappa,
tmp, inv_param->matpc_type, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order); retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp); freeParitySpinor(tmp);
...@@ -186,7 +279,15 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ...@@ -186,7 +279,15 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec); ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type); if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverMatPCDagMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, inv_param->kappa,
tmp, inv_param->matpc_type);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order); retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp); freeParitySpinor(tmp);
...@@ -202,9 +303,16 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) { ...@@ -202,9 +303,16 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order); loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, dagger, in.odd, -inv_param->kappa); if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, dagger, in.even, -inv_param->kappa); MatCuda(out, cudaGaugePrecise, in, -inv_param->kappa, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
cloverMatCuda(out, cudaGaugePrecise, cudaCloverPrecise, in, inv_param->kappa, tmp, dagger);
freeParitySpinor(tmp);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
}
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order); retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeSpinorField(out); freeSpinorField(out);
...@@ -244,8 +352,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -244,8 +352,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
b.odd = tmp; b.odd = tmp;
} }
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) { x.odd = tmp; x.even = out; } if (param->matpc_type == QUDA_MATPC_EVEN_EVEN ||
else { x.even = tmp; x.odd = out; } param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
x.odd = tmp;
x.even = out;
} else {
x.even = tmp;
x.odd = out;
}
loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order); loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order);
...@@ -258,13 +372,52 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -258,13 +372,52 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// cps uses a different anisotropy normalization // cps uses a different anisotropy normalization
if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
axCuda(1.0/gauge_param->anisotropy, b.even); axCuda(1.0/gauge_param->anisotropy, b.even);
axCuda(1.0/gauge_param->anisotropy, b.even); axCuda(1.0/gauge_param->anisotropy, b.odd);
} }
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) { if (param->dslash_type == QUDA_WILSON_DSLASH) {
dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa); if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
// in = b_e + k D_eo b_o
dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
} else if (param->matpc_type == QUDA_MATPC_ODD_ODD) {
// in = b_o + k D_oe b_e
dslashXpayCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
}
} else if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
// in = A_ee^-1 (b_e + k D_eo A_oo^-1 b_o)
ParitySpinor aux = tmp; // aliases b.odd when PRESERVE_SOURCE_NO is set
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, b.odd, 1);
dslashXpayCuda(aux, cudaGaugePrecise, in, 0, 0, b.even, kappa);
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, aux, 0);
} else if (param->matpc_type == QUDA_MATPC_ODD_ODD) {
// in = A_oo^-1 (b_o + k D_oe A_ee^-1 b_e)
ParitySpinor aux = tmp; // aliases b.odd when PRESERVE_SOURCE_NO is set
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, b.even, 0);
dslashXpayCuda(aux, cudaGaugePrecise, in, 1, 0, b.odd, kappa);
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, aux, 1);
} else if (param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
// in = b_e + k D_eo A_oo^-1 b_o
ParitySpinor aux = out; // aliases b.even when PRESERVE_SOURCE_NO is set
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, b.odd, 1);
dslashXpayCuda(aux, cudaGaugePrecise, in, 0, 0, b.even, kappa);
copyCuda(in, aux);
} else if (param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
// in = b_o + k D_oe A_ee^-1 b_e
ParitySpinor aux = out; // aliases b.even when PRESERVE_SOURCE_NO is set
cloverCuda(in, cudaGaugePrecise, cudaCloverInvPrecise, b.even, 0);
dslashXpayCuda(aux, cudaGaugePrecise, in, 1, 0, b.odd, kappa);
copyCuda(in, aux);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
}
} else { } else {
dslashXpayCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa); printf("QUDA error: unsupported dslash_type\n");
exit(-1);
} }
} else if (param->solution_type == QUDA_MATPC_SOLUTION || } else if (param->solution_type == QUDA_MATPC_SOLUTION ||
...@@ -293,14 +446,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -293,14 +446,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
copyCuda(out, in); copyCuda(out, in);
MatPCCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type, QUDA_DAG_YES); MatPCCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type, QUDA_DAG_YES);
} }
invertCgCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param); invertCgCuda(out, in, tmp, param);
break; break;
case QUDA_BICGSTAB_INVERTER: case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param, QUDA_DAG_YES); invertBiCGstabCuda(out, in, tmp, param, QUDA_DAG_YES);
copyCuda(in, out); copyCuda(in, out);
} }
invertBiCGstabCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param, QUDA_DAG_NO); invertBiCGstabCuda(out, in, tmp, param, QUDA_DAG_NO);
break; break;
default: default:
printf("Inverter type %d not implemented\n", param->inv_type); printf("Inverter type %d not implemented\n", param->inv_type);
...@@ -315,10 +468,32 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -315,10 +468,32 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order); loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order);
} }
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) { if (param->dslash_type == QUDA_WILSON_DSLASH) {
dslashXpayCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa); if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
// x_o = b_o + k D_oe x_e
dslashXpayCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
} else {
// x_e = b_e + k D_eo x_o
dslashXpayCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
}
} else { } else {
dslashXpayCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa); if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
// x_o = A_oo^-1 (b_o + k D_oe x_e)
ParitySpinor aux = b.even;
dslashXpayCuda(aux, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
cloverCuda(x.odd, cudaGaugePrecise, cudaCloverInvPrecise, aux, 1);
} else if (param->matpc_type == QUDA_MATPC_ODD_ODD) {
// x_e = A_ee^-1 (b_e + k D_eo x_o)
ParitySpinor aux = b.odd;
dslashXpayCuda(aux, cudaGaugePrecise, out, 0, 0, b.even, kappa);
cloverCuda(x.even, cudaGaugePrecise, cudaCloverInvPrecise, aux, 0);
} else if (param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
// x_o = b_o + k D_oe x_e
dslashXpayCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
} else {
// x_e = b_e + k D_eo x_o
dslashXpayCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
}
} }
retrieveSpinorField(h_x, x, param->cpu_prec, param->dirac_order); retrieveSpinorField(h_x, x, param->cpu_prec, param->dirac_order);
...@@ -335,4 +510,3 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -335,4 +510,3 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
return; return;
} }
...@@ -15,6 +15,8 @@ extern "C" { ...@@ -15,6 +15,8 @@ extern "C" {
QudaGaugeFieldOrder gauge_order; QudaGaugeFieldOrder gauge_order;
QudaTboundary t_boundary;
QudaPrecision cpu_prec; QudaPrecision cpu_prec;
QudaPrecision cuda_prec; QudaPrecision cuda_prec;
...@@ -25,29 +27,27 @@ extern "C" { ...@@ -25,29 +27,27 @@ extern "C" {
QudaGaugeFixed gauge_fix; QudaGaugeFixed gauge_fix;
QudaTboundary t_boundary; int blockDim; // number of threads in a block
int blockDim_sloppy;
int packed_size; int packed_size;
double gaugeGiB; double gaugeGiB;
int blockDim; // number of threads in a block
int blockDim_sloppy;
} QudaGaugeParam; } QudaGaugeParam;
typedef struct QudaInvertParam_s { typedef struct QudaInvertParam_s {
double kappa;
QudaMassNormalization mass_normalization;
QudaDslashType dslash_type; QudaDslashType dslash_type;
QudaInverterType inv_type; QudaInverterType inv_type;
double kappa;
double tol; double tol;
int iter;
int maxiter; int maxiter;
double reliable_delta; // reliable update tolerance double reliable_delta; // reliable update tolerance
QudaMatPCType matpc_type; QudaMatPCType matpc_type;
QudaSolutionType solution_type; QudaSolutionType solution_type;
QudaMassNormalization mass_normalization;
QudaPreserveSource preserve_source; QudaPreserveSource preserve_source;
...@@ -63,24 +63,26 @@ extern "C" { ...@@ -63,24 +63,26 @@ extern "C" {
QudaCloverFieldOrder clover_order; QudaCloverFieldOrder clover_order;
QudaVerbosity verbosity;
int iter;
double spinorGiB; double spinorGiB;
double cloverGiB; double cloverGiB;
double gflops; double gflops;
double secs; double secs;
QudaVerbosity verbosity;
} QudaInvertParam; } QudaInvertParam;
// Interface functions // Interface functions
void initQuda(int dev); void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param); void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param);
void discardCloverQuda(QudaInvertParam *inv_param);
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param); void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger); void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger);
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger); void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger); void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);
void endQuda(void); void endQuda(void);
......
...@@ -10,7 +10,7 @@ int main(int argc, char **argv) ...@@ -10,7 +10,7 @@ int main(int argc, char **argv)
{ {
int device = 0; int device = 0;
void *gauge[4]; void *gauge[4], *clover_inv;
QudaGaugeParam Gauge_param; QudaGaugeParam Gauge_param;
QudaInvertParam inv_param; QudaInvertParam inv_param;
...@@ -21,40 +21,53 @@ int main(int argc, char **argv) ...@@ -21,40 +21,53 @@ int main(int argc, char **argv)
Gauge_param.X[3] = 32; Gauge_param.X[3] = 32;
setDims(Gauge_param.X); setDims(Gauge_param.X);
Gauge_param.blockDim = 64; Gauge_param.anisotropy = 1.0;
Gauge_param.blockDim_sloppy = 64; Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION; Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION; Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12; Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.anisotropy = 1.0; Gauge_param.blockDim = 64;
Gauge_param.blockDim_sloppy = 64;
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param; gauge_param = &Gauge_param;
int clover_yes = 0; // 0 for plain Wilson, 1 for clover
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
double mass = -0.95; double mass = -0.95;
inv_param.kappa = 1.0 / (2.0*(4 + mass)); inv_param.kappa = 1.0 / (2.0*(1 + 3/gauge_param->anisotropy + mass));
inv_param.tol = 1e-12; inv_param.tol = 1e-12;
inv_param.maxiter = 10000; inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-3; inv_param.reliable_delta = 1e-3;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION; inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.dirac_order = QUDA_DIRAC_ORDER;
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
inv_param.verbosity = QUDA_VERBOSE; inv_param.verbosity = QUDA_VERBOSE;
size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
...@@ -65,6 +78,15 @@ int main(int argc, char **argv) ...@@ -65,6 +78,15 @@ int main(int argc, char **argv)
} }
construct_gauge_field(gauge, 1, Gauge_param.cpu_prec); construct_gauge_field(gauge, 1, Gauge_param.cpu_prec);
if (clover_yes) {
double norm = 1.0; // random components range between -norm and norm
double diag = 1.0; // constant added to the diagonal
size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
clover_inv = malloc(V*cloverSiteSize*cSize);
construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);
}
void *spinorIn = malloc(V*spinorSiteSize*sSize); void *spinorIn = malloc(V*spinorSiteSize*sSize);
void *spinorOut = malloc(V*spinorSiteSize*sSize); void *spinorOut = malloc(V*spinorSiteSize*sSize);
void *spinorCheck = malloc(V*spinorSiteSize*sSize); void *spinorCheck = malloc(V*spinorSiteSize*sSize);
...@@ -78,6 +100,7 @@ int main(int argc, char **argv) ...@@ -78,6 +100,7 @@ int main(int argc, char **argv)
initQuda(device); initQuda(device);
loadGaugeQuda((void*)gauge, &Gauge_param); loadGaugeQuda((void*)gauge, &Gauge_param);
if (clover_yes) loadCloverQuda(NULL, clover_inv, &inv_param);
invertQuda(spinorOut, spinorIn, &inv_param); invertQuda(spinorOut, spinorIn, &inv_param);
......
...@@ -42,9 +42,12 @@ extern "C" { ...@@ -42,9 +42,12 @@ extern "C" {
CloverFieldOrder clover_order); CloverFieldOrder clover_order);
void loadFullClover(FullClover ret, void *clover, Precision cpu_prec, void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order); CloverFieldOrder clover_order);
void loadCloverField(FullSpinor ret, void *clover, Precision cpu_prec, void loadCloverField(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order); CloverFieldOrder clover_order);
/* void createCloverField(FullClover *cudaClover, void *cpuClover, int *X,
Precision precision); */
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -440,6 +440,28 @@ void construct_gauge_field(void **gauge, int type, Precision precision) { ...@@ -440,6 +440,28 @@ void construct_gauge_field(void **gauge, int type, Precision precision) {
} }
} }
template <typename Float>
void constructCloverField(Float *res, double norm, double diag) {
Float c = 2.0 * norm / RAND_MAX;
for(int i = 0; i < V; i++) {
for (int j = 0; j < 72; j++) {
res[i*72 + j] = c * rand() - 1.0;
}
for (int j = 0; j< 6; j++) {
res[i*72 + j] += diag;
res[i*72 + j+36] += diag;
}
}
}
void construct_clover_field(void *clover, double norm, double diag, Precision precision) {
if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag);
else constructCloverField((float *)clover, norm, diag);
}
template <typename Float> template <typename Float>
void constructPointSpinorField(Float *res, int i0, int s0, int c0) { void constructPointSpinorField(Float *res, int i0, int s0, int c0) {
Float *resEven = res; Float *resEven = res;
......
...@@ -17,6 +17,7 @@ extern "C" { ...@@ -17,6 +17,7 @@ extern "C" {
int getOddBit(int X); int getOddBit(int X);
void construct_gauge_field(void **gauge, int type, Precision precision); void construct_gauge_field(void **gauge, int type, Precision precision);
void construct_clover_field(void *clover, double norm, double diag, Precision precision);
void construct_spinor_field(void *spinor, int type, int i0, int s0, int c0, Precision precision); void construct_spinor_field(void *spinor, int type, int i0, int s0, int c0, Precision precision);
void su3_construct(void *mat, ReconstructType reconstruct, Precision precision); void su3_construct(void *mat, ReconstructType reconstruct, Precision precision);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment