Advanced Computing Platform for Theoretical Physics

Commit 610ecbb7 authored by mikeaclark's avatar mikeaclark
Browse files

Added support for cps anisotropy

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@417 be54200a-260c-0410-bdd7-ce6af2a381ab
parent e5e5e289
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f #define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1) #define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
double Anisotropy;
template <typename Float> template <typename Float>
inline short FloatToShort(Float a) { inline short FloatToShort(Float a) {
return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT); return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
...@@ -111,9 +113,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re ...@@ -111,9 +113,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re
for (int dir = 0; dir < 4; dir++) { for (int dir = 0; dir < 4; dir++) {
Float *g = gauge + (oddBit*V*4+dir)*18; Float *g = gauge + (oddBit*V*4+dir)*18;
for (int i = 0; i < V; i++) { for (int i = 0; i < V; i++) {
// Must reorder rows-columns // Must reorder rows-columns and divide by anisotropy
for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++) for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r]; gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r] / Anisotropy;
pack12(res+i, gT, dir, V); pack12(res+i, gT, dir, V);
} }
} }
...@@ -121,9 +123,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re ...@@ -121,9 +123,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re
for (int dir = 0; dir < 4; dir++) { for (int dir = 0; dir < 4; dir++) {
Float *g = gauge + (oddBit*V*4+dir)*18; Float *g = gauge + (oddBit*V*4+dir)*18;
for (int i = 0; i < V; i++) { for (int i = 0; i < V; i++) {
// Must reorder rows-columns // Must reorder rows-columns and divide by anisotropy
for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++) for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r]; gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r] / Anisotropy;
pack8(res+i, gT, dir, V); pack8(res+i, gT, dir, V);
} }
} }
...@@ -191,6 +193,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType ...@@ -191,6 +193,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) { } else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh); packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh); packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh);
} else { } else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order); printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1); exit(-1);
...@@ -210,7 +213,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType ...@@ -210,7 +213,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
} }
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct,
Precision precision, int *X) { Precision precision, int *X, double anisotropy) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) { if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n"); printf("QUDA error: half precision not supported on cpu\n");
...@@ -227,6 +230,9 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco ...@@ -227,6 +230,9 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
exit(-1); exit(-1);
} }
Anisotropy = anisotropy;
cudaGauge->anisotropy = anisotropy;
cudaGauge->volume = 1; cudaGauge->volume = 1;
for (int d=0; d<4; d++) { for (int d=0; d<4; d++) {
cudaGauge->X[d] = X[d]; cudaGauge->X[d] = X[d];
......
...@@ -8,8 +8,8 @@ ...@@ -8,8 +8,8 @@
extern "C" { extern "C" {
#endif #endif
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct,
ReconstructType reconstruct, Precision precision, int *X); Precision precision, int *X, double anisotropy);
void freeGaugeField(FullGauge *cudaCauge); void freeGaugeField(FullGauge *cudaCauge);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -111,7 +111,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise ...@@ -111,7 +111,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy); rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z; rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates (ideally should be double precision) // reliable updates
rNorm = sqrt(r2); rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm; if (rNorm > maxrx) maxrx = rNorm;
if (rNorm > maxrr) maxrr = rNorm; if (rNorm > maxrr) maxrr = rNorm;
......
...@@ -59,7 +59,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -59,7 +59,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
int k=0; int k=0;
int xUpdate = 0, rUpdate = 0; int xUpdate = 0, rUpdate = 0;
//printf("%d iterations, r2 = %e\n", k, r2); printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart(); stopwatchStart();
while (r2 > stop && k<perf->maxiter) { while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type); MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
...@@ -110,7 +110,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -110,7 +110,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
} }
k++; k++;
//printf("%d iterations, r2 = %e\n", k, r2); printf("%d iterations, r2 = %e\n", k, r2);
} }
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy); if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
...@@ -121,7 +121,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise, ...@@ -121,7 +121,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
if (k==invert_param->maxiter) if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter); printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
//printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate); printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize); float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize);
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds()); //printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
......
...@@ -93,20 +93,15 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) ...@@ -93,20 +93,15 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{ {
gauge_param = param; gauge_param = param;
/* if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
printf("QUDA error: dimensions do not match: %d=%d, %d=%d, %d=%d, %d=%d\n",
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1);
}*/
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12; gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct,
gauge_param->cuda_prec, gauge_param->X); gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30); gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec || if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) { gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy,
gauge_param->cuda_prec_sloppy, gauge_param->X); gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30); gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
} else { } else {
cudaGaugeSloppy = cudaGaugePrecise; cudaGaugeSloppy = cudaGaugePrecise;
...@@ -212,6 +207,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -212,6 +207,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
param->iter = 0; param->iter = 0;
double kappa = param->kappa; double kappa = param->kappa;
if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa /= cudaGaugePrecise.anisotropy;
FullSpinor b, x; FullSpinor b, x;
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // source vector ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // source vector
...@@ -237,6 +233,12 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -237,6 +233,12 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
axCuda(2.0*kappa, b.odd); axCuda(2.0*kappa, b.odd);
} }
// cps uses a different anisotropy normalization
if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
axCuda(1.0/gauge_param->anisotropy, b.even);
axCuda(1.0/gauge_param->anisotropy, b.even);
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) { if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa); dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
} else { } else {
...@@ -253,6 +255,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -253,6 +255,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
axCuda(4.0*kappa*kappa, in); axCuda(4.0*kappa*kappa, in);
else else
axCuda(16.0*pow(kappa,4), in); axCuda(16.0*pow(kappa,4), in);
// cps uses a different anisotropy normalization
if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
if (param->solution_type == QUDA_MATPC_SOLUTION)
axCuda(pow(1.0/gauge_param->anisotropy, 2), in);
else
axCuda(pow(1.0/gauge_param->anisotropy, 4), in);
} }
switch (param->inv_type) { switch (param->inv_type) {
......
...@@ -97,13 +97,13 @@ void packTest() { ...@@ -97,13 +97,13 @@ void packTest() {
stopwatchStart(); stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER; param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X); createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
double cpsGtime = stopwatchReadSeconds(); double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime); printf("CPS Gauge send time = %e seconds\n", cpsGtime);
stopwatchStart(); stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X); createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
double qdpGtime = stopwatchReadSeconds(); double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime); printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......
...@@ -46,6 +46,7 @@ extern "C" { ...@@ -46,6 +46,7 @@ extern "C" {
ReconstructType reconstruct; ReconstructType reconstruct;
ParityGauge odd; ParityGauge odd;
ParityGauge even; ParityGauge even;
double anisotropy;
} FullGauge; } FullGauge;
typedef struct { typedef struct {
......
// this will evaluate wrong for (1/L^3) < 5e-7, i.e., L=100
#define FAST_INT_DIVIDE(a, b) \
((int)(__fdividef((float)a, (float)b)+0.0000005f))
#define FAST_INT_MOD(a, b) (a - FAST_INT_DIVIDE(a,b)*b)
// Performs complex addition // Performs complex addition
#define COMPLEX_ADD_TO(a, b) \ #define COMPLEX_ADD_TO(a, b) \
a##_re += b##_re, \ a##_re += b##_re, \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment