Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

dslash_test.c 9.28 KB
Newer Older
mikeaclark's avatar
mikeaclark committed
1 2 3 4
#include <stdio.h>
#include <stdlib.h>

#include <quda.h>
5
#include <dslash_reference.h>
mikeaclark's avatar
mikeaclark committed
6
#include <util_quda.h>
7 8
#include <spinor_quda.h>
#include <gauge_quda.h>
mikeaclark's avatar
mikeaclark committed
9

10
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
mikeaclark's avatar
mikeaclark committed
11
int test_type = 1;
rbabich's avatar
rbabich committed
12
// clover-improved? (0 = plain Wilson, 1 = clover)
mikeaclark's avatar
mikeaclark committed
13
int clover_yes = 0;
mikeaclark's avatar
mikeaclark committed
14

15
QudaGaugeParam gaugeParam;
mikeaclark's avatar
mikeaclark committed
16
QudaInvertParam inv_param;
17 18

FullGauge gauge;
rbabich's avatar
rbabich committed
19
FullClover clover, cloverInv;
mikeaclark's avatar
mikeaclark committed
20
FullSpinor cudaSpinor;
21
FullSpinor cudaSpinorOut;
mikeaclark's avatar
mikeaclark committed
22 23
ParitySpinor tmp;

rbabich's avatar
rbabich committed
24
void *hostGauge[4], *hostClover, *hostCloverInv;
25 26 27
void *spinor, *spinorEven, *spinorOdd;
void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
rbabich's avatar
rbabich committed
28

29
double kappa = 1.0;
30
int ODD_BIT = 1;
31
int DAGGER_BIT = 0;
32
int TRANSFER = 0; // include transfer time in the benchmark?
33

mikeaclark's avatar
mikeaclark committed
34 35
void init() {

mikeaclark's avatar
mikeaclark committed
36
  gaugeParam.X[0] = 24;
37
  gaugeParam.X[1] = 24;
mikeaclark's avatar
mikeaclark committed
38
  gaugeParam.X[2] = 24;
39
  gaugeParam.X[3] = 48;
40 41
  setDims(gaugeParam.X);

rbabich's avatar
rbabich committed
42 43 44 45
  gaugeParam.anisotropy = 2.3;

  gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
46

47
  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
48
  gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
49 50 51
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
  gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
  gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
52
  gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
mikeaclark's avatar
mikeaclark committed
53

rbabich's avatar
rbabich committed
54 55 56 57 58 59 60 61 62 63
  gaugeParam.blockDim = 64;

  if (clover_yes) {
    inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
  } else {
    inv_param.dslash_type = QUDA_WILSON_DSLASH;
  }

  inv_param.kappa = kappa;

64
  inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
65

66
  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
67
  inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
rbabich's avatar
rbabich committed
68

69 70
  if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
  else inv_param.dirac_order = QUDA_DIRAC_ORDER;
rbabich's avatar
rbabich committed
71

rbabich's avatar
rbabich committed
72 73
  if (clover_yes) {
    inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
74
    inv_param.clover_cuda_prec = QUDA_HALF_PRECISION;
75
    inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
rbabich's avatar
rbabich committed
76
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
rbabich's avatar
rbabich committed
77
  }
rbabich's avatar
rbabich committed
78
  inv_param.verbosity = QUDA_VERBOSE;
rbabich's avatar
rbabich committed
79 80

  gauge_param = &gaugeParam;
mikeaclark's avatar
mikeaclark committed
81 82
  invert_param = &inv_param;

83 84 85
  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

mikeaclark's avatar
mikeaclark committed
86
  // construct input fields
87
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize);
mikeaclark's avatar
mikeaclark committed
88

rbabich's avatar
rbabich committed
89 90
  if (clover_yes) {
    size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
91
    if (test_type > 0) {
rbabich's avatar
rbabich committed
92 93 94 95 96 97 98 99
      hostClover = malloc(V*cloverSiteSize*cSize);
      hostCloverInv = hostClover; // fake it
    } else {
      hostClover = NULL;
      hostCloverInv = malloc(V*cloverSiteSize*cSize);
    }
  }

100 101 102
  spinor = malloc(V*spinorSiteSize*sSize);
  spinorRef = malloc(V*spinorSiteSize*sSize);
  spinorGPU = malloc(V*spinorSiteSize*sSize);
mikeaclark's avatar
mikeaclark committed
103
  spinorEven = spinor;
104 105 106 107 108 109 110 111 112 113 114 115
  spinorRefEven = spinorRef;
  spinorGPUEven = spinorGPU;
  if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) {
    spinorOdd = (void*)((double*)spinor + Vh*spinorSiteSize);
    spinorRefOdd = (void*)((double*)spinorRef + Vh*spinorSiteSize);
    spinorGPUOdd = (void*)((double*)spinorGPU + Vh*spinorSiteSize);
  } else {
    spinorOdd = (void*)((float*)spinor + Vh*spinorSiteSize);
    spinorRefOdd = (void*)((float*)spinorRef + Vh*spinorSiteSize);
    spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize);
  }

rbabich's avatar
rbabich committed
116
  printf("Randomizing fields... ");
117

118 119
  construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
  construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
mikeaclark's avatar
mikeaclark committed
120

rbabich's avatar
rbabich committed
121
  if (clover_yes) {
122
    double norm = 0; // clover components are random numbers in the range (-norm, norm)
rbabich's avatar
rbabich committed
123 124 125 126 127 128 129 130
    double diag = 1.0; // constant added to the diagonal

    if (test_type == 2) {
      construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
    } else {
      construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
    }
  }
mikeaclark's avatar
mikeaclark committed
131 132
  printf("done.\n"); fflush(stdout);
  
133 134 135
  int dev = 0;
  initQuda(dev);

rbabich's avatar
rbabich committed
136
  loadGaugeQuda(hostGauge, &gaugeParam);
137
  gauge = cudaGaugePrecise;
mikeaclark's avatar
mikeaclark committed
138

rbabich's avatar
rbabich committed
139 140 141 142 143 144 145
  if (clover_yes) {
    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
    clover = cudaCloverPrecise;
    cloverInv = cudaCloverInvPrecise;
  }

  printf("Sending fields to GPU... "); fflush(stdout);
mikeaclark's avatar
mikeaclark committed
146

147
  if (!TRANSFER) {
148

149 150 151 152 153
    gaugeParam.X[0] /= 2;
    tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec);
    cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
    cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
    gaugeParam.X[0] *= 2;
154 155 156

    if (test_type < 2) {
      loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec, 
157
		       inv_param.dirac_order);
158 159
    } else {
      loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec, 
160
		      inv_param.dirac_order);
161
    }
162

163
  }
164

mikeaclark's avatar
mikeaclark committed
165 166 167 168 169
}

void end() {
  // release memory
  for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
rbabich's avatar
rbabich committed
170 171 172 173
  if (clover_yes) {
    if (test_type == 2) free(hostClover);
    else free(hostCloverInv);
  }
174
  free(spinorGPU);
mikeaclark's avatar
mikeaclark committed
175 176
  free(spinor);
  free(spinorRef);
177
  if (!TRANSFER) {
178
    freeSpinorField(cudaSpinorOut);
179 180 181 182
    freeSpinorField(cudaSpinor);
    freeParitySpinor(tmp);
  }
  endQuda();
mikeaclark's avatar
mikeaclark committed
183 184 185 186 187
}

double dslashCUDA() {

  // execute kernel
188
  const int LOOPS = 100;
mikeaclark's avatar
mikeaclark committed
189 190 191 192
  printf("Executing %d kernel loops...", LOOPS);
  fflush(stdout);
  stopwatchStart();
  for (int i = 0; i < LOOPS; i++) {
193 194
    switch (test_type) {
    case 0:
rbabich's avatar
rbabich committed
195 196 197 198 199 200 201
      if (TRANSFER) {
	dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
      } else if (!clover_yes) {
	dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
      } else {
	cloverDslashCuda(cudaSpinor.odd, gauge, cloverInv, cudaSpinor.even, ODD_BIT, DAGGER_BIT);    
      }
202 203
      break;
    case 1:
rbabich's avatar
rbabich committed
204 205 206
      if (TRANSFER) {
	MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
      } else if (!clover_yes) {
207
	MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, inv_param.matpc_type, DAGGER_BIT);
rbabich's avatar
rbabich committed
208 209
      } else {
	cloverMatPCCuda(cudaSpinor.odd, gauge, clover, cloverInv, cudaSpinor.even, kappa, tmp,
210
			inv_param.matpc_type, DAGGER_BIT);
rbabich's avatar
rbabich committed
211
      }
212 213
      break;
    case 2:
rbabich's avatar
rbabich committed
214 215 216 217 218 219 220
      if (TRANSFER) {
	MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
      } else if (!clover_yes) {
	MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
      } else {
	cloverMatCuda(cudaSpinorOut, gauge, clover, cudaSpinor, kappa, tmp, DAGGER_BIT);
      }
221
    }
mikeaclark's avatar
mikeaclark committed
222 223 224 225 226 227 228 229 230 231 232 233
  }
    
  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printf("with ERROR: %s\n", cudaGetErrorString(stat));

  cudaThreadSynchronize();
  double secs = stopwatchReadSeconds() / LOOPS;
  printf("done.\n\n");

  return secs;
234
}
mikeaclark's avatar
mikeaclark committed
235

236
void dslashRef() {
237

238
  // FIXME: remove once reference clover is finished
239 240 241 242 243 244
  if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
  }

245 246 247 248 249
  // compare to dslash reference implementation
  printf("Calculating reference implementation...");
  fflush(stdout);
  switch (test_type) {
  case 0:
250 251
    dslash(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT, 
	   inv_param.cpu_prec, gaugeParam.cpu_prec);
252 253
    break;
  case 1:    
254
    matpc(spinorRef, hostGauge, spinorEven, kappa, inv_param.matpc_type, DAGGER_BIT, 
255 256 257 258 259 260 261 262 263
	  inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  case 2:
    mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT, 
	inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  default:
    printf("Test type not defined\n");
    exit(-1);
264
  }
mikeaclark's avatar
mikeaclark committed
265

266
  printf("done.\n");
267
    
mikeaclark's avatar
mikeaclark committed
268 269
}

mikeaclark's avatar
mikeaclark committed
270 271 272
void dslashTest() {

  init();
273
  
274
  float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
275
  float sharedKB = (float)dslashCudaSharedBytes(inv_param.cuda_prec, gaugeParam.blockDim) / (1 << 10);
mikeaclark's avatar
mikeaclark committed
276
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
277
  printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
mikeaclark's avatar
mikeaclark committed
278
  printf("Shared mem: %.3f KB\n", sharedKB);
279 280
  
  int attempts = 1;
mikeaclark's avatar
mikeaclark committed
281 282 283 284
  dslashRef();
  for (int i=0; i<attempts; i++) {
    
    double secs = dslashCUDA();
285
    
286
    if (!TRANSFER) {
287 288 289 290
      if (test_type < 2) 
	retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
      else 
	retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
291
    }
292
    
mikeaclark's avatar
mikeaclark committed
293 294
    // print timing information
    printf("%fms per loop\n", 1000*secs);
295
    
296 297
    int flops = test_type ? 1320*2 + 48 : 1320;
    int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
rbabich's avatar
rbabich committed
298
    if (clover_yes) {
rbabich's avatar
rbabich committed
299 300 301
      flops += test_type ? 504*2 : 504;
      floats += test_type ? 72*2 : 72;
    }
302 303
    printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
    printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
mikeaclark's avatar
mikeaclark committed
304
    
305
    int res;
306 307
    if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Vh*4*3*2, 1e-4, inv_param.cpu_prec);
    else res = compare_floats(spinorGPU, spinorRef, V*4*3*2, 1e-4, inv_param.cpu_prec);
308 309 310
      
      printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
      
311 312
      //if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
      //else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);    
rbabich's avatar
rbabich committed
313
  }    
mikeaclark's avatar
mikeaclark committed
314 315 316 317 318 319
  end();
}

int main(int argc, char **argv) {
  dslashTest();
}