Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 587241a4 authored by mikeaclark's avatar mikeaclark
Browse files

Runtime block size in dslash kernel

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@441 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 54ccabd5
// *** CUDA DSLASH ***
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))
#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
// input spinor
#if (DD_SPREC==0)
#define spinorFloat double
......@@ -298,7 +294,7 @@ volatile spinorFloat o32_im;
#include "read_clover.h"
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int sid = blockIdx.x*blockDim.x + threadIdx.x;
int z1 = FAST_INT_DIVIDE(sid, X1h);
int x1h = sid - z1*X1h;
int z2 = FAST_INT_DIVIDE(z1, X2);
......
......@@ -133,8 +133,8 @@ def prolog():
str = []
str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
# str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
# str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
str.append("// input spinor\n")
......@@ -247,7 +247,7 @@ def prolog():
#include "read_clover.h"
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int sid = blockIdx.x*blockDim.x + threadIdx.x;
int z1 = FAST_INT_DIVIDE(sid, X1h);
int x1h = sid - z1*X1h;
int z2 = FAST_INT_DIVIDE(z1, X2);
......
// *** CUDA DSLASH DAGGER ***
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))
#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
// input spinor
#if (DD_SPREC==0)
#define spinorFloat double
......@@ -298,7 +294,7 @@ volatile spinorFloat o32_im;
#include "read_clover.h"
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int sid = blockIdx.x*blockDim.x + threadIdx.x;
int z1 = FAST_INT_DIVIDE(sid, X1h);
int x1h = sid - z1*X1h;
int z2 = FAST_INT_DIVIDE(z1, X2);
......
This diff is collapsed.
......@@ -9,8 +9,6 @@
#define spinorSiteSize 24 // real numbers per spinor
#define cloverSiteSize 72 // real numbers per block-diagonal clover matrix
#define BLOCK_DIM (64) // threads per block
#ifdef __cplusplus
extern "C" {
#endif
......@@ -25,7 +23,7 @@ extern "C" {
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes();
int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
void initDslashCuda();
void bindGaugeTex(FullGauge gauge, int oddBit);
......
......@@ -30,12 +30,15 @@ int TRANSFER = 0; // include transfer time in the benchmark?
void init() {
gaugeParam.X[0] = 36;
gaugeParam.X[1] = 36;
gaugeParam.X[2] = 36;
gaugeParam.X[3] = 24;
gaugeParam.X[0] = 64;
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 26;
gaugeParam.X[3] = 26;
setDims(gaugeParam.X);
gaugeParam.blockDim = 64;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
......@@ -129,7 +132,7 @@ void end() {
double dslashCUDA() {
// execute kernel
const int LOOPS = 10;
const int LOOPS = 100;
printf("Executing %d kernel loops...", LOOPS);
fflush(stdout);
stopwatchStart();
......@@ -191,53 +194,47 @@ void dslashRef() {
void dslashTest() {
init();
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
float sharedKB = (float)dslashCudaSharedBytes(inv_param.cuda_prec, gaugeParam.blockDim) / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
printf("Shared mem: %.3f KB\n", sharedKB);
int attempts = 10000;
int attempts = 1;
dslashRef();
for (int i=0; i<attempts; i++) {
double secs = dslashCUDA();
if (!TRANSFER) {
if (test_type < 2)
retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
else
retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
/*for (int is=0; is<Vh; is++) {
printf("%e %e\n", ((double*)spinorRef)[is*24], ((double*)spinorOdd)[is*24]);
}
exit(0);*/
int res;
if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Vh*4*3*2, 1e-4, inv_param.cpu_prec);
else res = compare_floats(spinorGPU, spinorRef, V*4*3*2, 1e-4, inv_param.cpu_prec);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
exit(0);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
}
end();
}
int main(int argc, char **argv) {
......
......@@ -223,7 +223,7 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct,
Precision precision, int *X, double anisotropy) {
Precision precision, int *X, double anisotropy, int blockDim) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
......@@ -251,6 +251,8 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
cudaGauge->volume /= 2;
cudaGauge->blockDim = blockDim;
allocateGaugeField(cudaGauge, reconstruct, precision);
if (precision == QUDA_DOUBLE_PRECISION) {
......
......@@ -9,7 +9,7 @@ extern "C" {
#endif
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct,
Precision precision, int *X, double anisotropy);
Precision precision, int *X, double anisotropy, int blockDim);
void freeGaugeField(FullGauge *cudaCauge);
#ifdef __cplusplus
......
......@@ -96,12 +96,13 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct,
gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy);
gauge_param->cuda_prec, gauge_param->X, gauge_param->anisotropy, gauge_param->blockDim);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy,
gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy);
gauge_param->cuda_prec_sloppy, gauge_param->X, gauge_param->anisotropy,
gauge_param->blockDim_sloppy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
......
......@@ -30,6 +30,8 @@ extern "C" {
int packed_size;
double gaugeGiB;
int blockDim; // number of threads in a block
int blockDim_sloppy;
} QudaGaugeParam;
typedef struct QudaInvertParam_s {
......
......@@ -21,6 +21,8 @@ int main(int argc, char **argv)
Gauge_param.X[3] = 32;
setDims(Gauge_param.X);
Gauge_param.blockDim = 64;
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
......
......@@ -40,6 +40,8 @@ void init() {
Precision single = QUDA_SINGLE_PRECISION;
param.blockDim = 64;
param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_SINGLE_PRECISION;
param.reconstruct = QUDA_RECONSTRUCT_12;
......@@ -97,13 +99,13 @@ void packTest() {
stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X, 1.0, param.blockDim);
double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime);
stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0);
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X, 1.0, param.blockDim);
double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......
......@@ -37,6 +37,7 @@ extern "C" {
typedef void *ParityGauge;
typedef struct {
int blockDim; // The size of the thread block to use
size_t bytes;
Precision precision;
int length; // total length
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment