Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 15bfc7cc authored by rbabich's avatar rbabich
Browse files

quda: changed where autotuned parameters are set so that initBlas() is

only necessary before doing reductions.


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@613 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7ac1f1ee
......@@ -16,7 +16,7 @@ BiCGstab are provided, with support for double, single, and half
Software compatibility:
The library has been tested under linux (CentOS 5.3 and Ubuntu 8.04)
The library has been tested under Linux (CentOS 5.3 and Ubuntu 8.04)
using release 2.3 of the CUDA toolkit. There are known issues with
releases 2.1 and 2.2, but 2.0 should work if one is forced to use an
older version (for compatibility with an old driver, for example).
......@@ -45,6 +45,24 @@ edit the first few lines to specify the CUDA install path, the
platform (x86 or x86_64), and the GPU architecture (see "Hardware
compatibility" above). Then type 'make' to build the library.
As an optional step, 'make tune' will invoke tests/blas_test to
perform autotuning of the various BLAS-like functions needed by the
inverters. This involves testing many combinations of parameters
(corresponding to different numbers of CUDA threads per block and
blocks per grid for each kernel) and writing the optimal values to
lib/blas_param.h. The new values will take effect the next time the
library is built. Ideally, the autotuning should be performed on the
machine where the library is to be used, since the optimal parameters
will depend on the CUDA device and host hardware.
In summary, for an optimized install, run
make && make tune && make
By default, the autotuning is performed using CUDA device 0. To
select a different device number, set DEVICE in make.inc
appropriately.
Using the library:
......@@ -53,21 +71,16 @@ against lib/libquda.a, and study tests/invert_test.c for an example of
the interface. The various inverter options are enumerated in
include/enum_quda.h.
The lib/blas_quda.cu file contains all of the BLAS-like functions
required for the inverters. The threads per block and blocks per grid
parameters are auto-tuned using the blas_test function in tests/, and
the output stored in blas_param.h which is included here. These
optimal values may change as a function of the CUDA device and the
host hardware, so re-running blas_test and copying over the output
blas_param.h into lib/ and recompiling the blas library may provide
extra performance.
Known issues:
* One of the stages of the build process requires over 5 GB of memory.
If too little memory is available, the compilation will either take
a very long time (given enough swap space) or fail completely.
* When building for the 'sm_13' GPU architecture (which enables double
precision support), one of the stages in the build process requires
over 5 GB of memory. If too little memory is available, the
compilation will either take a very long time (given enough swap
space) or fail completely. In addition, the CUDA C compiler
requires over 1 GB of disk space in /tmp for the creation of
temporary files.
* For compatibility with CUDA, on 32-bit platforms the library is compiled
with the GCC option -malign-double. This differs from the GCC default
......@@ -88,7 +101,7 @@ M. A. Clark, R. Babich, K. Barros, R. Brower, and C. Rebbi, "Solving
Lattice QCD systems of equations using mixed precision solvers on
GPUs" (2009), arXiv:0911.3191 [hep-lat].
Please also drop us a note so that we can inform you of updates and
Please also drop us a note so that we may inform you of updates and
bug-fixes. The most recent public release will always be available
online at http://lattice.bu.edu/quda/
......@@ -11,10 +11,9 @@ QUDA_HDRS = blas_quda.h clover_quda.h dslash_quda.h enum_quda.h gauge_quda.h \
# files containing complex macros and other code fragments to be inlined,
# found in lib/
QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h \
dslash_def.h dslash_textures.h io_spinor.h read_clover.h \
read_gauge.h reduce_complex_core.h reduce_core.h \
reduce_triple_core.h
QUDA_INLN = check_params.h clover_def.h dslash_common.h dslash_def.h \
dslash_textures.h io_spinor.h read_clover.h read_gauge.h \
reduce_complex_core.h reduce_core.h reduce_triple_core.h
# files generated by the scripts in lib/generate/, found in lib/dslash_core/
# (The current clover_core.h was edited by hand.)
......@@ -41,6 +40,9 @@ clean:
%.o: %.cpp $(HDRS)
$(CXX) $(CXXFLAGS) $< -c -o $@
blas_quda.o: blas_quda.cu blas_param.h $(HDRS)
$(NVCC) $(NVCCFLAGS) $< -c -o $@
%.o: %.cu $(HDRS) $(CORE)
$(NVCC) $(NVCCFLAGS) $< -c -o $@
......
/*
Auto-tuned blas CUDA parameters, generated by blas_test
*/
// Kernel: copyCuda
blas_threads[0][0] = 64;
blas_blocks[0][0] = 2048;
// Kernel: axpbyCuda
blas_threads[0][1] = 64;
blas_blocks[0][1] = 2048;
// Kernel: xpyCuda
blas_threads[0][2] = 64;
blas_blocks[0][2] = 2048;
// Kernel: axpyCuda
blas_threads[0][3] = 64;
blas_blocks[0][3] = 2048;
// Kernel: xpayCuda
blas_threads[0][4] = 64;
blas_blocks[0][4] = 2048;
// Kernel: mxpyCuda
blas_threads[0][5] = 64;
blas_blocks[0][5] = 2048;
// Kernel: axCuda
blas_threads[0][6] = 64;
blas_blocks[0][6] = 2048;
// Kernel: caxpyCuda
blas_threads[0][7] = 64;
blas_blocks[0][7] = 2048;
// Kernel: caxpbyCuda
blas_threads[0][8] = 64;
blas_blocks[0][8] = 2048;
// Kernel: cxpaypbzCuda
blas_threads[0][9] = 64;
blas_blocks[0][9] = 2048;
// Kernel: axpyZpbxCuda
blas_threads[0][10] = 64;
blas_blocks[0][10] = 512;
// Kernel: caxpbypzYmbwCuda
blas_threads[0][11] = 64;
blas_blocks[0][11] = 1024;
// Kernel: sumCuda
blas_threads[0][12] = 64;
blas_blocks[0][12] = 128;
// Kernel: normCuda
blas_threads[0][13] = 64;
blas_blocks[0][13] = 128;
// Kernel: reDotProductCuda
blas_threads[0][14] = 64;
blas_blocks[0][14] = 64;
// Kernel: axpyNormCuda
blas_threads[0][15] = 64;
blas_blocks[0][15] = 256;
// Kernel: xmyNormCuda
blas_threads[0][16] = 64;
blas_blocks[0][16] = 512;
// Kernel: cDotProductCuda
blas_threads[0][17] = 64;
blas_blocks[0][17] = 64;
// Kernel: xpaycDotzyCuda
blas_threads[0][18] = 64;
blas_blocks[0][18] = 256;
// Kernel: cDotProductNormACuda
blas_threads[0][19] = 64;
blas_blocks[0][19] = 64;
// Kernel: cDotProductNormBCuda
blas_threads[0][20] = 64;
blas_blocks[0][20] = 64;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[0][21] = 64;
blas_blocks[0][21] = 512;
// Kernel: copyCuda
blas_threads[1][0] = 64;
blas_blocks[1][0] = 1024;
// Kernel: axpbyCuda
blas_threads[1][1] = 128;
blas_blocks[1][1] = 128;
// Kernel: xpyCuda
blas_threads[1][2] = 128;
blas_blocks[1][2] = 128;
// Kernel: axpyCuda
blas_threads[1][3] = 128;
blas_blocks[1][3] = 128;
// Kernel: xpayCuda
blas_threads[1][4] = 128;
blas_blocks[1][4] = 128;
// Kernel: mxpyCuda
blas_threads[1][5] = 128;
blas_blocks[1][5] = 128;
// Kernel: axCuda
blas_threads[1][6] = 64;
blas_blocks[1][6] = 128;
// Kernel: caxpyCuda
blas_threads[1][7] = 64;
blas_blocks[1][7] = 128;
// Kernel: caxpbyCuda
blas_threads[1][8] = 64;
blas_blocks[1][8] = 128;
// Kernel: cxpaypbzCuda
blas_threads[1][9] = 64;
blas_blocks[1][9] = 128;
// Kernel: axpyZpbxCuda
blas_threads[1][10] = 64;
blas_blocks[1][10] = 2048;
// Kernel: caxpbypzYmbwCuda
blas_threads[1][11] = 64;
blas_blocks[1][11] = 128;
// Kernel: sumCuda
blas_threads[1][12] = 128;
blas_blocks[1][12] = 1024;
// Kernel: normCuda
blas_threads[1][13] = 128;
blas_blocks[1][13] = 1024;
// Kernel: reDotProductCuda
blas_threads[1][14] = 128;
blas_blocks[1][14] = 1024;
// Kernel: axpyNormCuda
blas_threads[1][15] = 128;
blas_blocks[1][15] = 1024;
// Kernel: xmyNormCuda
blas_threads[1][16] = 128;
blas_blocks[1][16] = 1024;
// Kernel: cDotProductCuda
blas_threads[1][17] = 128;
blas_blocks[1][17] = 512;
// Kernel: xpaycDotzyCuda
blas_threads[1][18] = 64;
blas_blocks[1][18] = 128;
// Kernel: cDotProductNormACuda
blas_threads[1][19] = 64;
blas_blocks[1][19] = 1024;
// Kernel: cDotProductNormBCuda
blas_threads[1][20] = 64;
blas_blocks[1][20] = 1024;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[1][21] = 64;
blas_blocks[1][21] = 128;
// Kernel: copyCuda
blas_threads[2][0] = 64;
blas_blocks[2][0] = 128;
// Kernel: axpbyCuda
blas_threads[2][1] = 64;
blas_blocks[2][1] = 128;
// Kernel: xpyCuda
blas_threads[2][2] = 64;
blas_blocks[2][2] = 128;
// Kernel: axpyCuda
blas_threads[2][3] = 64;
blas_blocks[2][3] = 128;
// Kernel: xpayCuda
blas_threads[2][4] = 64;
blas_blocks[2][4] = 128;
// Kernel: mxpyCuda
blas_threads[2][5] = 64;
blas_blocks[2][5] = 128;
// Kernel: axCuda
blas_threads[2][6] = 64;
blas_blocks[2][6] = 2048;
// Kernel: caxpyCuda
blas_threads[2][7] = 64;
blas_blocks[2][7] = 2048;
// Kernel: caxpbyCuda
blas_threads[2][8] = 64;
blas_blocks[2][8] = 2048;
// Kernel: cxpaypbzCuda
blas_threads[2][9] = 64;
blas_blocks[2][9] = 2048;
// Kernel: axpyZpbxCuda
blas_threads[2][10] = 64;
blas_blocks[2][10] = 128;
// Kernel: caxpbypzYmbwCuda
blas_threads[2][11] = 64;
blas_blocks[2][11] = 128;
// Kernel: sumCuda
blas_threads[2][12] = 128;
blas_blocks[2][12] = 128;
// Kernel: normCuda
blas_threads[2][13] = 128;
blas_blocks[2][13] = 128;
// Kernel: reDotProductCuda
blas_threads[2][14] = 128;
blas_blocks[2][14] = 128;
// Kernel: axpyNormCuda
blas_threads[2][15] = 64;
blas_blocks[2][15] = 128;
// Kernel: xmyNormCuda
blas_threads[2][16] = 64;
blas_blocks[2][16] = 128;
// Kernel: cDotProductCuda
blas_threads[2][17] = 64;
blas_blocks[2][17] = 128;
// Kernel: xpaycDotzyCuda
blas_threads[2][18] = 64;
blas_blocks[2][18] = 128;
// Kernel: cDotProductNormACuda
blas_threads[2][19] = 64;
blas_blocks[2][19] = 128;
// Kernel: cDotProductNormBCuda
blas_threads[2][20] = 64;
blas_blocks[2][20] = 128;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[2][21] = 64;
blas_blocks[2][21] = 256;
//
// Auto-tuned blas CUDA parameters, generated by blas_test
//

// Optimal CUDA threads per block for each blas kernel, indexed
// [kernel][precision].  The three columns correspond to the three
// precisions benchmarked by blas_test (prec = 0, 1, 2 — presumably
// half, single, and double; confirm against the "Benchmarking %d bit
// precision" loop in tests/blas_test).  This file is regenerated by
// 'make tune'; do not edit the values by hand.
static int blas_threads[22][3] = {
{ 64, 64, 64}, // Kernel 0: copyCuda
{ 64, 128, 64}, // Kernel 1: axpbyCuda
{ 64, 128, 64}, // Kernel 2: xpyCuda
{ 64, 128, 64}, // Kernel 3: axpyCuda
{ 64, 128, 64}, // Kernel 4: xpayCuda
{ 64, 128, 64}, // Kernel 5: mxpyCuda
{ 64, 64, 64}, // Kernel 6: axCuda
{ 64, 64, 64}, // Kernel 7: caxpyCuda
{ 64, 64, 64}, // Kernel 8: caxpbyCuda
{ 64, 64, 64}, // Kernel 9: cxpaypbzCuda
{ 64, 128, 64}, // Kernel 10: axpyZpbxCuda
{ 64, 64, 64}, // Kernel 11: caxpbypzYmbwCuda
{ 64, 128, 128}, // Kernel 12: sumCuda
{ 64, 128, 128}, // Kernel 13: normCuda
{ 64, 128, 128}, // Kernel 14: reDotProductCuda
{ 64, 128, 64}, // Kernel 15: axpyNormCuda
{ 64, 128, 64}, // Kernel 16: xmyNormCuda
{ 64, 128, 64}, // Kernel 17: cDotProductCuda
{ 64, 64, 64}, // Kernel 18: xpaycDotzyCuda
{ 64, 64, 64}, // Kernel 19: cDotProductNormACuda
{ 64, 64, 64}, // Kernel 20: cDotProductNormBCuda
{ 64, 64, 64} // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
};

// Optimal number of thread blocks per grid for each blas kernel,
// indexed [kernel][precision] with the same column convention as
// blas_threads above.  setBlock() clamps the value against the
// problem length at launch time.
static int blas_blocks[22][3] = {
{2048, 1024, 128}, // Kernel 0: copyCuda
{2048, 128, 128}, // Kernel 1: axpbyCuda
{2048, 128, 128}, // Kernel 2: xpyCuda
{2048, 128, 128}, // Kernel 3: axpyCuda
{2048, 128, 128}, // Kernel 4: xpayCuda
{2048, 128, 128}, // Kernel 5: mxpyCuda
{2048, 128, 2048}, // Kernel 6: axCuda
{2048, 128, 2048}, // Kernel 7: caxpyCuda
{2048, 128, 2048}, // Kernel 8: caxpbyCuda
{2048, 128, 2048}, // Kernel 9: cxpaypbzCuda
{ 512, 128, 128}, // Kernel 10: axpyZpbxCuda
{1024, 128, 128}, // Kernel 11: caxpbypzYmbwCuda
{ 128, 1024, 128}, // Kernel 12: sumCuda
{ 128, 1024, 128}, // Kernel 13: normCuda
{ 64, 1024, 128}, // Kernel 14: reDotProductCuda
{ 256, 1024, 128}, // Kernel 15: axpyNormCuda
{ 512, 1024, 128}, // Kernel 16: xmyNormCuda
{ 64, 512, 128}, // Kernel 17: cDotProductCuda
{ 256, 128, 128}, // Kernel 18: xpaycDotzyCuda
{ 64, 1024, 128}, // Kernel 19: cDotProductNormACuda
{ 64, 1024, 128}, // Kernel 20: cDotProductNormBCuda
{ 512, 128, 256} // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
};
......@@ -33,14 +33,12 @@ static QudaSumFloat3 *h_reduceFloat3=0;
unsigned long long blas_quda_flops;
unsigned long long blas_quda_bytes;
// Number of threads used for each blas kernel
static int blas_threads[3][22];
// Number of thread blocks for each blas kernel
static int blas_blocks[3][22];
static dim3 blasBlock;
static dim3 blasGrid;
// generated by blas_test
#include <blas_param.h>
void initBlas(void)
{
if (!d_reduceFloat) {
......@@ -78,10 +76,6 @@ void initBlas(void)
errorQuda("Error allocating host reduction array");
}
}
// Output from blas_test
#include <blas_param.h>
}
void endBlas(void)
......@@ -104,11 +98,12 @@ void setBlasTuning(int tuning)
void setBlasParam(int kernel, int prec, int threads, int blocks)
{
blas_threads[prec][kernel] = threads;
blas_blocks[prec][kernel] = blocks;
blas_threads[kernel][prec] = threads;
blas_blocks[kernel][prec] = blocks;
}
void setBlock(int kernel, int length, QudaPrecision precision) {
void setBlock(int kernel, int length, QudaPrecision precision)
{
int prec;
switch(precision) {
case QUDA_HALF_PRECISION:
......@@ -122,8 +117,8 @@ void setBlock(int kernel, int length, QudaPrecision precision) {
break;
}
int blocks = min(blas_blocks[prec][kernel], max(length/blas_threads[prec][kernel], 1));
blasBlock.x = blas_threads[prec][kernel];
int blocks = min(blas_blocks[kernel][prec], max(length/blas_threads[kernel][prec], 1));
blasBlock.x = blas_threads[kernel][prec];
blasBlock.y = 1;
blasBlock.z = 1;
......
......@@ -7,6 +7,8 @@
#include <test_util.h>
#define Nkernels 22
QudaPrecision cuda_prec;
QudaPrecision other_prec; // Used for copy benchmark
ParitySpinor x, y, z, w, v, p;
......@@ -20,8 +22,8 @@ int gridSizes[] = {64, 128, 256, 512, 1024, 2048};
int prec;
void init() {
void init()
{
int X[4];
X[0] = 24;
......@@ -63,7 +65,9 @@ void init() {
setBlasTuning(1);
}
void end() {
void end()
{
// release memory
freeParitySpinor(p);
freeParitySpinor(v);
......@@ -73,6 +77,7 @@ void end() {
freeParitySpinor(z);
}
double benchmark(int kernel) {
double a, b;
......@@ -195,14 +200,44 @@ double benchmark(int kernel) {
}
int main(int argc, char** argv) {
// Write the autotuned launch parameters for all Nkernels blas kernels
// to blas_param.h, formatted as the two static tables (blas_threads
// and blas_blocks) that lib/blas_quda.cu includes at compile time.
//
//   names   - kernel name strings, one per kernel (Nkernels entries)
//   threads - optimal threads per block, indexed [kernel][precision]
//   blocks  - optimal blocks per grid, indexed [kernel][precision]
//
// The three columns per row correspond to the three precisions
// benchmarked by blas_test.
void write(char *names[], int threads[][3], int blocks[][3])
{
  printf("\nWriting optimal parameters to blas_param.h\n");

  FILE *fp = fopen("blas_param.h", "w");
  if (!fp) {
    // Fix: the original passed a NULL stream to fprintf when the file
    // could not be opened (e.g. read-only directory), crashing the tuner.
    printf("Error opening blas_param.h for writing; parameters not saved\n");
    return;
  }

  fprintf(fp, "//\n// Auto-tuned blas CUDA parameters, generated by blas_test\n//\n\n");

  fprintf(fp, "static int blas_threads[%d][3] = {\n", Nkernels);
  for (int i=0; i<Nkernels; i++) {
    // Last row omits the trailing comma so the table is valid C.
    fprintf(fp, " {%4d, %4d, %4d}%c // Kernel %2d: %s\n", threads[i][0], threads[i][1], threads[i][2],
            ((i == Nkernels-1) ? ' ' : ','), i, names[i]);
  }
  fprintf(fp, "};\n\n");

  fprintf(fp, "static int blas_blocks[%d][3] = {\n", Nkernels);
  for (int i=0; i<Nkernels; i++) {
    fprintf(fp, " {%4d, %4d, %4d}%c // Kernel %2d: %s\n", blocks[i][0], blocks[i][1], blocks[i][2],
            ((i == Nkernels-1) ? ' ' : ','), i, names[i]);
  }
  fprintf(fp, "};\n");

  // fclose flushes buffered output; without it the generated header
  // could be truncated.
  fclose(fp);
}
int main(int argc, char** argv)
{
int dev = 0;
if (argc == 2) dev = atoi(argv[1]);
initQuda(dev);
int threads[Nkernels][3];
int blocks[Nkernels][3];
int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21};
char names[][100] = {
char *names[] = {
"copyCuda",
"axpbyCuda",
"xpyCuda",
......@@ -227,22 +262,21 @@ int main(int argc, char** argv) {
"caxpbypzYmbwcDotProductWYNormYQuda"
};
FILE *blas_out = fopen("blas_param.h", "w");
fprintf(blas_out, "/*\n Auto-tuned blas CUDA parameters, generated by blas_test\n*/\n");
for (prec = 0; prec<3; prec++) {
for (prec = 0; prec < 3; prec++) {
init();
printf("\nBenchmarking %d bit precision\n", (int)(pow(2.0,prec)*16));
for (int i = 0; i <= 21; i++) {
for (int i = 0; i < Nkernels; i++) {
double gflops_max = 0.0;
double gbytes_max = 0.0;
int threads_max = 0;
int blocks_max = 0;
for (int thread=0; thread<Nthreads; thread++) {
for (int grid=0; grid<Ngrids; grid++) {
for (int thread = 0; thread < Nthreads; thread++) {
for (int grid = 0; grid < Ngrids; grid++) {
setBlasParam(i, prec, blockSizes[thread], gridSizes[grid]);
// first do warmup run
......@@ -269,27 +303,22 @@ int main(int argc, char** argv) {
blocks_max = gridSizes[grid];
}
//printf("%d %d %-36s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
// blockSizes[thread], gridSizes[grid], names[i], secs, flops, gflops, gbytes);
// printf("%d %d %-36s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
// blockSizes[thread], gridSizes[grid], names[i], secs, flops, gflops, gbytes);
}
}
if (threads_max == 0 || blocks_max == 0)
errorQuda("Autotuning failed for %s kernel", names[i]);
if (threads_max == 0) errorQuda("Autotuning failed for %s kernel", names[i]);
printf("%-36s Performance maximum at %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n",
names[i], threads_max, blocks_max, gflops_max, gbytes_max);
fprintf(blas_out, "// Kernel: %s\n", names[i]);
fprintf(blas_out, "blas_threads[%d][%d] = %d;\n", prec, i, threads_max);
fprintf(blas_out, "blas_blocks[%d][%d] = %d;\n\n", prec, i, blocks_max);
threads[i][prec] = threads_max;
blocks[i][prec] = blocks_max;
}
end();
}
fclose(blas_out);
write(names, threads, blocks);
endQuda();
}
......
......@@ -31,9 +31,9 @@ void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
double kappa = 1.0;
int ODD_BIT = 1;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
int parity = 1; // even or odd? (0 = even, 1 = odd)