Added support for cps anisotropy

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@417 be54200a-260c-0410-bdd7-ce6af2a381ab

Added support for cps anisotropy
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@417 be54200a-260c-0410-bdd7-ce6af2a381ab
610ecbb7 · mikeaclark · e5e5e289 · 610ecbb7 · 610ecbb7 · 610ecbb7
Commit 610ecbb7 authored Jul 15, 2009 by mikeaclark
--- a/gauge_quda.cpp
+++ b/gauge_quda.cpp
@@ -10,6 +10,8 @@
 #define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
 #define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)

+double Anisotropy;
+
 template <typename Float>
 inline short FloatToShort(Float a) {
  return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
@@ -111,9 +113,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
-	// Must reorder rows-columns
+	// Must reorder rows-columns and divide by anisotropy
 	for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
-	  gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];      
+	  gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r] / Anisotropy;
 	pack12(res+i, gT, dir, V);
      }
    } 
@@ -121,9 +123,9 @@ void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType re
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
-	// Must reorder rows-columns
+	// Must reorder rows-columns and divide by anisotropy
 	for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
-	  gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];      
+	  gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r] / Anisotropy;
 	pack8(res+i, gT, dir, V);
      }
    }
@@ -191,6 +193,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
  } else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh);
    packCPSGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, Vh);
+    
  } else {
    printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
    exit(-1);
@@ -210,7 +213,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
 }

 void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, 
-		      Precision precision, int *X) {
+		      Precision precision, int *X, double anisotropy) {

  if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
    printf("QUDA error: half precision not supported on cpu\n");
@@ -227,6 +230,9 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
    exit(-1);
  }

+  Anisotropy = anisotropy;
+
+  cudaGauge->anisotropy = anisotropy;
  cudaGauge->volume = 1;
  for (int d=0; d<4; d++) {
    cudaGauge->X[d] = X[d];

--- a/gauge_quda.h
+++ b/gauge_quda.h
@@ -8,8 +8,8 @@
 extern "C" {
 #endif

-  void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, 
-			ReconstructType reconstruct, Precision precision, int *X);
+  void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, 
+			Precision precision, int *X, double anisotropy);
  void freeGaugeField(FullGauge *cudaCauge);
  
 #ifdef __cplusplus

--- a/inv_bicgstab_quda.cpp
+++ b/inv_bicgstab_quda.cpp
@@ -111,7 +111,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
    rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
    rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;

-    // reliable updates (ideally should be double precision)
+    // reliable updates
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;

--- a/inv_cg_quda.cpp
+++ b/inv_cg_quda.cpp
@@ -59,7 +59,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
  int k=0;
  int xUpdate = 0, rUpdate = 0;

-  //printf("%d iterations, r2 = %e\n", k, r2);
+  printf("%d iterations, r2 = %e\n", k, r2);
  stopwatchStart();
  while (r2 > stop && k<perf->maxiter) {
    MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
@@ -110,7 +110,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
    }

    k++;
-    //printf("%d iterations, r2 = %e\n", k, r2);
+    printf("%d iterations, r2 = %e\n", k, r2);
  }

  if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
@@ -121,7 +121,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
  if (k==invert_param->maxiter) 
    printf("Exceeded maximum iterations %d\n", invert_param->maxiter);

-  //printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
+  printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);

  float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize);
  //printf("%f gflops\n", k*gflops / stopwatchReadSeconds());

--- a/invert_quda.cpp
+++ b/invert_quda.cpp
@@ -93,20 +93,15 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
  gauge_param = param;

-  /*  if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
-    printf("QUDA error: dimensions do not match: %d=%d, %d=%d, %d=%d, %d=%d\n", 
-	   gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
-    exit(-1);
-    }*/
  gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;

  createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, 
-		   gauge_param->cuda_prec, gauge_param->X);
+		   gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy);
  gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
  if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
      gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
    createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, 
-		     gauge_param->cuda_prec_sloppy, gauge_param->X);
+		     gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy);
    gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
  } else {
    cudaGaugeSloppy = cudaGaugePrecise;
@@ -212,6 +207,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
  param->iter = 0;

  double kappa = param->kappa;
+  if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa /= cudaGaugePrecise.anisotropy;

  FullSpinor b, x;
  ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // source vector
@@ -237,6 +233,12 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
      axCuda(2.0*kappa, b.odd);
    }

+    // cps uses a different anisotropy normalization
+    if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
+      axCuda(1.0/gauge_param->anisotropy, b.even);
+      axCuda(1.0/gauge_param->anisotropy, b.even);
+    }
+
    if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
      dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
    } else {
@@ -253,6 +255,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
 	axCuda(4.0*kappa*kappa, in);
      else
 	axCuda(16.0*pow(kappa,4), in);
+
+    // cps uses a different anisotropy normalization
+    if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
+      if (param->solution_type == QUDA_MATPC_SOLUTION) 
+	axCuda(pow(1.0/gauge_param->anisotropy, 2), in);
+      else 
+	axCuda(pow(1.0/gauge_param->anisotropy, 4), in);
+
  }

  switch (param->inv_type) {

--- a/pack_test.c
+++ b/pack_test.c
@@ -97,13 +97,13 @@ void packTest() {
  
  stopwatchStart();
  param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
-  createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X);
+  createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
  double cpsGtime = stopwatchReadSeconds();
  printf("CPS Gauge send time = %e seconds\n", cpsGtime);

  stopwatchStart();
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
-  createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X);
+  createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
  double qdpGtime = stopwatchReadSeconds();
  printf("QDP Gauge send time = %e seconds\n", qdpGtime);


--- a/quda.h
+++ b/quda.h
@@ -46,6 +46,7 @@ extern "C" {
    ReconstructType reconstruct;
    ParityGauge odd;
    ParityGauge even;
+    double anisotropy;
  } FullGauge;
  
  typedef struct {

--- a/read_gauge.h
+++ b/read_gauge.h
+// this will evaluate wrong for (1/L^3) < 5e-7, i.e., L=100
+
+#define FAST_INT_DIVIDE(a, b)			\
+  ((int)(__fdividef((float)a, (float)b)+0.0000005f))
+
+#define FAST_INT_MOD(a, b) (a - FAST_INT_DIVIDE(a,b)*b)
+
 // Performs complex addition
 #define COMPLEX_ADD_TO(a, b)			\
  a##_re += b##_re,				\