Runtime block size in dslash kernel

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@441 be54200a-260c-0410-bdd7-ce6af2a381ab

Runtime block size in dslash kernel
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@441 be54200a-260c-0410-bdd7-ce6af2a381ab
587241a4 · mikeaclark · 54ccabd5 · 587241a4 · 587241a4 · 587241a4
Commit 587241a4 authored Aug 10, 2009 by mikeaclark
--- a/dslash_core.h
+++ b/dslash_core.h
 // *** CUDA DSLASH ***

 #define SHARED_FLOATS_PER_THREAD 0
-#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))
-
-#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
-
 // input spinor
 #if (DD_SPREC==0)
 #define spinorFloat double
@@ -298,7 +294,7 @@ volatile spinorFloat o32_im;
 #include "read_clover.h"
 #include "io_spinor.h"

-int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
+int sid = blockIdx.x*blockDim.x + threadIdx.x;
 int z1 = FAST_INT_DIVIDE(sid, X1h);
 int x1h = sid - z1*X1h;
 int z2 = FAST_INT_DIVIDE(z1, X2);

--- a/dslash_cuda_gen.py
+++ b/dslash_cuda_gen.py
@@ -133,8 +133,8 @@ def prolog():
    str = []
    str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
    str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
-    str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
-    str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
+#    str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
+#    str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
    
    str.append("// input spinor\n")

@@ -247,7 +247,7 @@ def prolog():
 #include "read_clover.h"
 #include "io_spinor.h"

-int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
+int sid = blockIdx.x*blockDim.x + threadIdx.x;
 int z1 = FAST_INT_DIVIDE(sid, X1h);
 int x1h = sid - z1*X1h;
 int z2 = FAST_INT_DIVIDE(z1, X2);

--- a/dslash_dagger_core.h
+++ b/dslash_dagger_core.h
 // *** CUDA DSLASH DAGGER ***

 #define SHARED_FLOATS_PER_THREAD 0
-#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))
-
-#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
-
 // input spinor
 #if (DD_SPREC==0)
 #define spinorFloat double
@@ -298,7 +294,7 @@ volatile spinorFloat o32_im;
 #include "read_clover.h"
 #include "io_spinor.h"

-int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
+int sid = blockIdx.x*blockDim.x + threadIdx.x;
 int z1 = FAST_INT_DIVIDE(sid, X1h);
 int x1h = sid - z1*X1h;
 int z2 = FAST_INT_DIVIDE(z1, X2);

--- a/dslash_quda.cu
+++ b/dslash_quda.cu
--- a/dslash_quda.h
+++ b/dslash_quda.h
@@ -9,8 +9,6 @@
 #define spinorSiteSize 24 // real numbers per spinor
 #define cloverSiteSize 72 // real numbers per block-diagonal clover matrix

-#define BLOCK_DIM (64) // threads per block
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -25,7 +23,7 @@ extern "C" {

 // ---------- dslash_quda.cu ----------

-  int dslashCudaSharedBytes();
+  int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
  void initDslashCuda();
  void bindGaugeTex(FullGauge gauge, int oddBit);


--- a/dslash_test.c
+++ b/dslash_test.c
@@ -30,12 +30,15 @@ int TRANSFER = 0; // include transfer time in the benchmark?

 void init() {

-  gaugeParam.X[0] = 36;
-  gaugeParam.X[1] = 36;
-  gaugeParam.X[2] = 36;
-  gaugeParam.X[3] = 24;
+  gaugeParam.X[0] = 64;
+  gaugeParam.X[1] = 24;
+  gaugeParam.X[2] = 26;
+  gaugeParam.X[3] = 26;
+
  setDims(gaugeParam.X);

+  gaugeParam.blockDim = 64;
+
  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
  gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
@@ -129,7 +132,7 @@ void end() {
 double dslashCUDA() {

  // execute kernel
-  const int LOOPS = 10;
+  const int LOOPS = 100;
  printf("Executing %d kernel loops...", LOOPS);
  fflush(stdout);
  stopwatchStart();
@@ -191,53 +194,47 @@ void dslashRef() {
 void dslashTest() {

  init();
-
+  
  float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
-  float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
+  float sharedKB = (float)dslashCudaSharedBytes(inv_param.cuda_prec, gaugeParam.blockDim) / (1 << 10);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
  printf("Shared mem: %.3f KB\n", sharedKB);
-
-  int attempts = 10000;
+  
+  int attempts = 1;
  dslashRef();
  for (int i=0; i<attempts; i++) {
    
    double secs = dslashCUDA();
-  
+    
    if (!TRANSFER) {
      if (test_type < 2) 
 	retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
      else 
 	retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
    }
-
+    
    // print timing information
    printf("%fms per loop\n", 1000*secs);
-
+    
    int flops = test_type ? 1320*2 + 48 : 1320;
    int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
    printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
    printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
-
-    /*for (int is=0; is<Vh; is++) {
-      printf("%e %e\n", ((double*)spinorRef)[is*24], ((double*)spinorOdd)[is*24]);
-    }
-    exit(0);*/
    
    int res;
    if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Vh*4*3*2, 1e-4, inv_param.cpu_prec);
    else res = compare_floats(spinorGPU, spinorRef, V*4*3*2, 1e-4, inv_param.cpu_prec);
-
-    printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
-
-    if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
-    else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
-
-    exit(0);
+      
+      printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
+      
+      if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
+      else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
+    
  }  
-
+  
  end();
-
+  
 }

 int main(int argc, char **argv) {

--- a/gauge_quda.cpp
+++ b/gauge_quda.cpp
@@ -223,7 +223,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
 }

 void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, 
-		      Precision precision, int *X, double anisotropy) {
+		      Precision precision, int *X, double anisotropy, int blockDim) {

  if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
    printf("QUDA error: half precision not supported on cpu\n");
@@ -251,6 +251,8 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
  cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
  cudaGauge->volume /= 2;

+  cudaGauge->blockDim = blockDim;
+
  allocateGaugeField(cudaGauge, reconstruct, precision);

  if (precision == QUDA_DOUBLE_PRECISION) {

--- a/gauge_quda.h
+++ b/gauge_quda.h
@@ -9,7 +9,7 @@ extern "C" {
 #endif

  void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, 
-			Precision precision, int *X, double anisotropy);
+			Precision precision, int *X, double anisotropy, int blockDim);
  void freeGaugeField(FullGauge *cudaCauge);
  
 #ifdef __cplusplus

--- a/invert_quda.cpp
+++ b/invert_quda.cpp
@@ -96,12 +96,13 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
  gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;

  createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, 
-		   gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy);
+		   gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy, gauge_param->blockDim);
  gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
  if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
      gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
    createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, 
-		     gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy);
+		     gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy,
+		     gauge_param->blockDim_sloppy);
    gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
  } else {
    cudaGaugeSloppy = cudaGaugePrecise;

--- a/invert_quda.h
+++ b/invert_quda.h
@@ -30,6 +30,8 @@ extern "C" {
    int packed_size;
    double gaugeGiB;

+    int blockDim; // number of threads in a block
+    int blockDim_sloppy;
  } QudaGaugeParam;

  typedef struct QudaInvertParam_s {

--- a/invert_test.c
+++ b/invert_test.c
@@ -21,6 +21,8 @@ int main(int argc, char **argv)
  Gauge_param.X[3] = 32;
  setDims(Gauge_param.X);

+  Gauge_param.blockDim = 64;
+
  Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;

  Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;

--- a/pack_test.c
+++ b/pack_test.c
@@ -40,6 +40,8 @@ void init() {

  Precision single = QUDA_SINGLE_PRECISION;

+  param.blockDim = 64;
+
  param.cpu_prec = QUDA_SINGLE_PRECISION;
  param.cuda_prec = QUDA_SINGLE_PRECISION;
  param.reconstruct = QUDA_RECONSTRUCT_12;
@@ -97,13 +99,13 @@ void packTest() {
  
  stopwatchStart();
  param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
-  createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
+  createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0, param.blockDim);
  double cpsGtime = stopwatchReadSeconds();
  printf("CPS Gauge send time = %e seconds\n", cpsGtime);

  stopwatchStart();
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
-  createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
+  createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0, param.blockDim);
  double qdpGtime = stopwatchReadSeconds();
  printf("QDP Gauge send time = %e seconds\n", qdpGtime);


--- a/quda.h
+++ b/quda.h
@@ -37,6 +37,7 @@ extern "C" {
  typedef void *ParityGauge;

  typedef struct {
+    int blockDim; // The size of the thread block to use
    size_t bytes;
    Precision precision;
    int length; // total length