Advanced Computing Platform for Theoretical Physics

dslash_test.c 6.31 KB
Newer Older
mikeaclark's avatar
mikeaclark committed
1
2
3
4
#include <stdio.h>
#include <stdlib.h>

#include <quda.h>
5
#include <dslash_reference.h>
mikeaclark's avatar
mikeaclark committed
6
#include <util_quda.h>
7
8
#include <spinor_quda.h>
#include <gauge_quda.h>
mikeaclark's avatar
mikeaclark committed
9

10
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
mikeaclark's avatar
mikeaclark committed
11
int test_type = 1;
mikeaclark's avatar
mikeaclark committed
12

13
QudaGaugeParam gaugeParam;
mikeaclark's avatar
mikeaclark committed
14
QudaInvertParam inv_param;
15
16

FullGauge gauge;
mikeaclark's avatar
mikeaclark committed
17
FullSpinor cudaSpinor;
18
FullSpinor cudaSpinorOut;
mikeaclark's avatar
mikeaclark committed
19
20
ParitySpinor tmp;

21
22
23
void *hostGauge[4];
void *spinor, *spinorRef, *spinorGPU;
void *spinorEven, *spinorOdd;
mikeaclark's avatar
mikeaclark committed
24
    
25
double kappa = 1.0;
mikeaclark's avatar
mikeaclark committed
26
27
int ODD_BIT = 0;
int DAGGER_BIT = 0;
mikeaclark's avatar
mikeaclark committed
28
int TRANSFER = 1; // include transfer time in the benchmark?
29

mikeaclark's avatar
mikeaclark committed
30
31
void init() {

32
  gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
mikeaclark's avatar
mikeaclark committed
33
  gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
34
35
36
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
  gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
  gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
37
38
39
40
41
42
43
44
45
  gaugeParam.X = L1;
  gaugeParam.Y = L2;
  gaugeParam.Z = L3;
  gaugeParam.T = L4;
  gaugeParam.anisotropy = 2.3;
  gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
  gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  gauge_param = &gaugeParam;
mikeaclark's avatar
mikeaclark committed
46

47
  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
mikeaclark's avatar
mikeaclark committed
48
  inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
49
50
  if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
  else inv_param.dirac_order = QUDA_DIRAC_ORDER;
51
  inv_param.kappa = kappa;
mikeaclark's avatar
mikeaclark committed
52
53
  invert_param = &inv_param;

54
55
56
  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

mikeaclark's avatar
mikeaclark committed
57
  // construct input fields
58
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
mikeaclark's avatar
mikeaclark committed
59

60
61
62
  spinor = malloc(N*spinorSiteSize*sSize);
  spinorRef = malloc(N*spinorSiteSize*sSize);
  spinorGPU = malloc(N*spinorSiteSize*sSize);
mikeaclark's avatar
mikeaclark committed
63
  spinorEven = spinor;
64
65
66
67
  if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)
    spinorOdd = (void*)((double*)spinor + Nh*spinorSiteSize);
  else 
    spinorOdd = (void*)((float*)spinor + Nh*spinorSiteSize);
mikeaclark's avatar
mikeaclark committed
68
69
    
  printf("Randomizing fields...");
70
71
  construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
  construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
mikeaclark's avatar
mikeaclark committed
72

mikeaclark's avatar
mikeaclark committed
73
74
  printf("done.\n"); fflush(stdout);
  
75
76
  int dev = 0;
  initQuda(dev);
77
  loadGaugeQuda(hostGauge, &gaugeParam);
78

79
  gauge = cudaGaugePrecise;
mikeaclark's avatar
mikeaclark committed
80
81
82

  printf("Sending fields to GPU..."); fflush(stdout);

83
  if (!TRANSFER) {
84
85
86
87
88
89
90

    tmp = allocateParitySpinor(Nh, inv_param.cuda_prec);
    cudaSpinor = allocateSpinorField(N, inv_param.cuda_prec);
    cudaSpinorOut = allocateSpinorField(N, inv_param.cuda_prec);

    if (test_type < 2) {
      loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec, 
91
		       inv_param.dirac_order);
92
93
    } else {
      loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec, 
94
		      inv_param.dirac_order);
95
    }
96
  }
97
98


mikeaclark's avatar
mikeaclark committed
99
100
101
102
103
}

void end() {
  // release memory
  for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
104
  free(spinorGPU);
mikeaclark's avatar
mikeaclark committed
105
106
  free(spinor);
  free(spinorRef);
107
  if (!TRANSFER) {
108
    freeSpinorField(cudaSpinorOut);
109
110
111
112
    freeSpinorField(cudaSpinor);
    freeParitySpinor(tmp);
  }
  endQuda();
mikeaclark's avatar
mikeaclark committed
113
114
115
116
117
}

double dslashCUDA() {

  // execute kernel
118
  const int LOOPS = 10;
mikeaclark's avatar
mikeaclark committed
119
120
121
122
  printf("Executing %d kernel loops...", LOOPS);
  fflush(stdout);
  stopwatchStart();
  for (int i = 0; i < LOOPS; i++) {
123
124
    switch (test_type) {
    case 0:
125
126
      if (TRANSFER) dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
      else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
127
128
      break;
    case 1:
mikeaclark's avatar
mikeaclark committed
129
130
      if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
      else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
131
132
      break;
    case 2:
mikeaclark's avatar
mikeaclark committed
133
134
      if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
      else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
135
    }
mikeaclark's avatar
mikeaclark committed
136
137
138
139
140
141
142
143
144
145
146
147
  }
    
  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printf("with ERROR: %s\n", cudaGetErrorString(stat));

  cudaThreadSynchronize();
  double secs = stopwatchReadSeconds() / LOOPS;
  printf("done.\n\n");

  return secs;
148
}
mikeaclark's avatar
mikeaclark committed
149

150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
void dslashRef() {
  
  // compare to dslash reference implementation
  printf("Calculating reference implementation...");
  fflush(stdout);
  switch (test_type) {
  case 0:
    dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT, 
		     inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  case 1:    
    matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT, 
	  inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  case 2:
    mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT, 
	inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  default:
    printf("Test type not defined\n");
    exit(-1);
171
  }
mikeaclark's avatar
mikeaclark committed
172

173
  printf("done.\n");
174
    
mikeaclark's avatar
mikeaclark committed
175
176
}

mikeaclark's avatar
mikeaclark committed
177
178
179
180
void dslashTest() {

  init();

181
  float spinorGiB = (float)Nh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
mikeaclark's avatar
mikeaclark committed
182
183
  float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
184
  printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
mikeaclark's avatar
mikeaclark committed
185
186
  printf("Shared mem: %.3f KB\n", sharedKB);

mikeaclark's avatar
mikeaclark committed
187
  int attempts = 10000;
mikeaclark's avatar
mikeaclark committed
188
189
190
191
  dslashRef();
  for (int i=0; i<attempts; i++) {
    
    double secs = dslashCUDA();
192
193
  
    if (!TRANSFER) {
194
195
      if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
      else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
196
    }
mikeaclark's avatar
mikeaclark committed
197
198
    // print timing information
    printf("%fms per loop\n", 1000*secs);
199
200
    int flops = test_type ? 1320*2 + 48 : 1320;
    int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
mikeaclark's avatar
mikeaclark committed
201
202
203
    printf("GFLOPS = %f\n", 1.0e-9*flops*Nh/secs);
    printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
    
204
    int res;
205
206
    if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Nh*4*3*2, 1e-4, inv_param.cpu_prec);
    else res = compare_floats(spinorGPU, spinorRef, N*4*3*2, 1e-4, inv_param.cpu_prec);
mikeaclark's avatar
mikeaclark committed
207
    printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
mikeaclark's avatar
mikeaclark committed
208

209
210
    if (test_type < 2) strong_check(spinorRef, spinorOdd, Nh, inv_param.cpu_prec);
    else strong_check(spinorRef, spinorGPU, Nh, inv_param.cpu_prec);
mikeaclark's avatar
mikeaclark committed
211
212

    exit(0);
mikeaclark's avatar
mikeaclark committed
213
214
215
216
217
218
219
220
221
  }  

  end();

}

int main(int argc, char **argv) {
  dslashTest();
}