Advanced Computing Platform for Theoretical Physics

Commit 7364b3e7 authored by rbabich

quda: added 'tune' target to Makefile (still debugging blas_test)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@604 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 99f30e5e
@@ -10,6 +10,11 @@ lib:
tests:
$(MAKE) -C tests/
+tune:
+$(MAKE) -C tests/ tune
+@echo "Autotuning completed successfully. " \
+"Please type 'make' to rebuild library."
gen:
$(MAKE) -C lib/ gen
@@ -17,4 +22,4 @@ clean:
$(MAKE) -C lib/ clean
$(MAKE) -C tests/ clean
-.PHONY: all lib tests gen clean
+.PHONY: all lib tests tune gen clean
@@ -20,6 +20,9 @@ extern "C" {
void initBlas(void);
void endBlas(void);
+void setBlasTuning(int tuning);
+void setBlasParam(int kernel, int prec, int threads, int blocks);
void zeroCuda(ParitySpinor a);
void copyCuda(ParitySpinor dst, ParitySpinor src);
......
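The two new entry points let the autotuner steer the library from outside: setBlasTuning() relaxes the in-library error checking, and setBlasParam() writes one (precision, kernel) slot of the blas_threads/blas_blocks launch tables. A minimal sketch of driver-side usage follows, assuming placeholder sweep values rather than the block and grid size tables blas_test really uses:

// Sketch only: how a tuning driver might exercise the new interface.
// The sweep values below are placeholders, not the sizes used by blas_test.
extern "C" {
  void setBlasTuning(int tuning);
  void setBlasParam(int kernel, int prec, int threads, int blocks);
}

void sweepOneKernel(int kernel, int prec) {
  setBlasTuning(1);                           // defer error checking to the caller
  const int threadSweep[] = {64, 128, 256};   // candidate threads per block (placeholder)
  const int blockSweep[]  = {64, 128, 256};   // candidate blocks per grid (placeholder)
  for (int t = 0; t < 3; t++) {
    for (int b = 0; b < 3; b++) {
      setBlasParam(kernel, prec, threadSweep[t], blockSweep[b]);
      // ... launch and time the kernel; keep the fastest error-free configuration ...
    }
  }
  setBlasTuning(0);                           // restore normal error checking
}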
@@ -41,8 +41,8 @@ int blas_blocks[3][22];
static dim3 blasBlock;
static dim3 blasGrid;
-void initBlas(void) {
+void initBlas(void)
+{
if (!d_reduceFloat) {
if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
errorQuda("Error allocating device reduction array");
@@ -84,7 +84,8 @@ void initBlas(void) {
}
-void endBlas(void) {
+void endBlas(void)
+{
if (d_reduceFloat) cudaFree(d_reduceFloat);
if (d_reduceComplex) cudaFree(d_reduceComplex);
if (d_reduceFloat3) cudaFree(d_reduceFloat3);
@@ -93,6 +94,20 @@ void endBlas(void) {
if (h_reduceFloat3) cudaFreeHost(h_reduceFloat3);
}
+// blasTuning = 1 turns off error checking
+static int blasTuning = 0;
+void setBlasTuning(int tuning)
+{
+blasTuning = tuning;
+}
+void setBlasParam(int kernel, int prec, int threads, int blocks)
+{
+blas_threads[prec][kernel] = threads;
+blas_blocks[prec][kernel] = blocks;
+}
void setBlock(int kernel, int length, QudaPrecision precision) {
int prec;
switch(precision) {
......
@@ -132,7 +132,9 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPre
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
-checkCudaError();
+// for a tuning run, let blas_test check the error condition
+if (!blasTuning) checkCudaError();
cuDoubleComplex gpu_result;
gpu_result.x = 0;
......
@@ -114,12 +114,12 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
-checkCudaError();
+// for a tuning run, let blas_test check the error condition
+if (!blasTuning) checkCudaError();
double cpu_sum = 0;
for (int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];
return cpu_sum;
}
@@ -146,7 +146,9 @@ double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision p
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
-checkCudaError();
+// for a tuning run, let blas_test check the error condition
+if (!blasTuning) checkCudaError();
double3 gpu_result;
gpu_result.x = 0;
......
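All three reduction variants (complex, real, and double3) get the same guard: during a tuning run an out-of-range launch configuration is expected to fail, so the wrapper no longer aborts and the pending CUDA error is left for blas_test to inspect. A stripped-down sketch of the pattern, with a hypothetical reduceKernel standing in for the generated REDUCE_FUNC_NAME code:

// Sketch of the deferred-error pattern; reduceKernel and the buffer names
// are stand-ins, not the actual QUDA reduction code.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

static int blasTuning = 0;        // mirrors the flag added in blas_quda.cu

__global__ void reduceKernel(const float *in, float *partial, int n) {
  // per-block partial sums omitted in this sketch
}

double reduceCuda(const float *d_in, int n, dim3 grid, dim3 block,
                  float *d_partial, float *h_partial) {
  reduceKernel<<<grid, block>>>(d_in, d_partial, n);
  cudaMemcpy(h_partial, d_partial, grid.x * sizeof(float), cudaMemcpyDeviceToHost);

  // Normal runs fail fast; tuning runs leave the error for blas_test,
  // which calls cudaGetLastError() itself and discards the bad configuration.
  if (!blasTuning) {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
      fprintf(stderr, "%s\n", cudaGetErrorString(err));
      exit(1);
    }
  }

  double sum = 0;
  for (unsigned int i = 0; i < grid.x; i++) sum += h_partial[i];
  return sum;
}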
@@ -6,7 +6,8 @@ CPU_ARCH = x86_64 # x86 or x86_64
GPU_ARCH = sm_13 # sm_10, sm_11, sm_12, or sm_13
GPU_EMU = false # set to 'true' for device emulation
-PYTHON = python2.6 # python 2.5 or later required for 'make gen'
+PYTHON = python # python 2.5 or later required for 'make gen'
+DEVICE = 0 # CUDA device to use for 'make tune'
######
......
@@ -25,6 +25,9 @@ pack_test: pack_test.o test_util.o dslash_reference.o $(QUDA)
blas_test: blas_test.o $(QUDA)
$(CXX) $(LDFLAGS) $^ -o $@
+tune: blas_test
+./blas_test $(DEVICE) && cp -f blas_param.h ../lib/
clean:
-rm -f *.o $(TESTS)
@@ -37,4 +40,4 @@ clean:
%.o: %.cu $(HDRS)
$(NVCC) $(NVCCFLAGS) $< -c -o $@
-.PHONY: all clean
+.PHONY: all tune clean
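The tune target runs the sweep on the CUDA device selected by DEVICE in make.inc and, only if blas_test exits successfully, copies the regenerated blas_param.h into lib/ so that a subsequent 'make' rebuilds the library with the new launch parameters. Judging from the blas_threads[3][22]/blas_blocks[3][22] tables and the "// Kernel: %s" header blas_test writes for each kernel, the generated file presumably holds entries along these lines (an illustrative guess, not actual output):

// Hypothetical fragment of a generated blas_param.h (format assumed).
// Kernel: axpyCuda
blas_threads[0][3] = 128;
blas_blocks[0][3] = 128;
blas_threads[1][3] = 64;
blas_blocks[1][3] = 256;
blas_threads[2][3] = 128;
blas_blocks[2][3] = 64;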
@@ -48,12 +48,19 @@ void init() {
// need single parity dimensions
X[0] /= 2;
v = allocateParitySpinor(X, cuda_prec, sp_pad);
w = allocateParitySpinor(X, cuda_prec, sp_pad);
x = allocateParitySpinor(X, cuda_prec, sp_pad);
y = allocateParitySpinor(X, cuda_prec, sp_pad);
z = allocateParitySpinor(X, cuda_prec, sp_pad);
p = allocateParitySpinor(X, other_prec, sp_pad);
+// check for successful allocation
+checkCudaError();
+// turn off error checking in blas kernels
+setBlasTuning(1);
}
void end() {
@@ -186,33 +193,35 @@ double benchmark(int kernel) {
int main(int argc, char** argv) {
int dev = 0;
if (argc == 2) dev = atoi(argv[1]);
initQuda(dev);
int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21};
char names[][100] = {
"copyCuda ",
"axpbyCuda ",
"xpyCuda ",
"axpyCuda ",
"xpayCuda ",
"mxpyCuda ",
"axCuda ",
"caxpyCuda ",
"caxpbyCuda ",
"cxpaypbzCuda ",
"axpyZpbxCuda ",
"caxpbypzYmbwCuda ",
"sumCuda ",
"normCuda ",
"reDotProductCuda ",
"axpyNormCuda ",
"xmyNormCuda ",
"cDotProductCuda ",
"xpaycDotzyCuda ",
"cDotProductNormACuda ",
"cDotProductNormBCuda ",
"caxpbypzYmbwcDotProductWYNormYQuda "
"copyCuda",
"axpbyCuda",
"xpyCuda",
"axpyCuda",
"xpayCuda",
"mxpyCuda",
"axCuda",
"caxpyCuda",
"caxpbyCuda",
"cxpaypbzCuda",
"axpyZpbxCuda",
"caxpbypzYmbwCuda",
"sumCuda",
"normCuda",
"reDotProductCuda",
"axpyNormCuda",
"xmyNormCuda",
"cDotProductCuda",
"xpaycDotzyCuda",
"cDotProductNormACuda",
"cDotProductNormBCuda",
"caxpbypzYmbwcDotProductWYNormYQuda"
};
FILE *blas_out = fopen("blas_param.h", "w");
@@ -230,9 +239,10 @@ int main(int argc, char** argv) {
int threads_max = 0;
int blocks_max = 0;
for (int thread=0; thread<Nthreads; thread++) {
-blas_threads[prec][i] = blockSizes[thread];
for (int grid=0; grid<Ngrids; grid++) {
-blas_blocks[prec][i] = gridSizes[grid];
+setBlasParam(i, prec, blockSizes[thread], gridSizes[grid]);
+if (i==12) printfQuda("warmup %d %d\n", blockSizes[thread], gridSizes[grid]); // DEBUG
// first do warmup run
nIters = 1;
@@ -242,6 +252,14 @@ int main(int argc, char** argv) {
blas_quda_flops = 0;
blas_quda_bytes = 0;
+// DEBUG
+{
+cudaError_t error = cudaGetLastError();
+if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error));
+}
+if (i==12) printfQuda("running %d %d\n", blockSizes[thread], gridSizes[grid]);
+// END DEBUG
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops;
double bytes = blas_quda_bytes;
@@ -249,19 +267,26 @@ int main(int argc, char** argv) {
double gflops = (flops*1e-9)/(secs);
double gbytes = bytes/(secs*(1<<30));
-if (gbytes > gbytes_max && gbytes < 300) { // prevents selection of failed parameters
+cudaError_t error = cudaGetLastError();
+if (error != cudaSuccess) warningQuda("%s", cudaGetErrorString(error)); // DEBUG
+if (gbytes > gbytes_max && error == cudaSuccess) { // prevents selection of failed parameters
gflops_max = gflops;
gbytes_max = gbytes;
threads_max = blockSizes[thread];
blocks_max = gridSizes[grid];
}
//printf("%d %d %s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
//printf("%d %d %-36s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
// blockSizes[thread], gridSizes[grid], names[i], secs, flops, gflops, gbytes);
}
}
printf("%s Performance maximum at %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n",
if (threads_max == 0 || blocks_max == 0)
errorQuda("Autotuning failed for %s kernel", names[i]);
printf("%-36s Performance maximum at %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n",
names[i], threads_max, blocks_max, gflops_max, gbytes_max);
fprintf(blas_out, "// Kernel: %s\n", names[i]);
......
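Taken together, the blas_test changes turn the benchmark into the autotuner behind 'make tune': for each kernel and precision it sweeps the candidate block and grid sizes through setBlasParam(), times each configuration, and keeps the fastest one that finished without a CUDA error, failing loudly if none did. A condensed sketch of that per-kernel sweep (the helper declarations and the format of the emitted parameter lines are assumptions, not the literal blas_test code):

// Condensed sketch of the per-kernel sweep; setBlasParam(), benchmark() and
// blas_quda_bytes appear in the diff above, the rest is simplified.
#include <cuda_runtime.h>
#include <cstdio>

extern "C" void setBlasParam(int kernel, int prec, int threads, int blocks);
extern double benchmark(int kernel);   // seconds for one timed run
extern double blas_quda_bytes;         // bytes moved by the last run (type assumed here)

void tuneKernel(int kernel, int prec, const char *name,
                const int *blockSizes, int Nthreads,
                const int *gridSizes, int Ngrids, FILE *blas_out) {
  double gbytes_max = 0.0;
  int threads_max = 0, blocks_max = 0;

  for (int t = 0; t < Nthreads; t++) {
    for (int g = 0; g < Ngrids; g++) {
      setBlasParam(kernel, prec, blockSizes[t], gridSizes[g]);

      benchmark(kernel);               // warmup run
      cudaGetLastError();              // clear any error left by the warmup
      blas_quda_bytes = 0;

      double secs = benchmark(kernel); // timed run
      double gbytes = blas_quda_bytes / (secs * (1 << 30));

      // An oversized launch leaves an error behind; only clean runs
      // are allowed to become the new best configuration.
      if (cudaGetLastError() == cudaSuccess && gbytes > gbytes_max) {
        gbytes_max = gbytes;
        threads_max = blockSizes[t];
        blocks_max = gridSizes[g];
      }
    }
  }

  if (threads_max == 0 || blocks_max == 0) {
    fprintf(stderr, "Autotuning failed for %s kernel\n", name);  // errorQuda in blas_test
    return;
  }

  fprintf(blas_out, "// Kernel: %s\n", name);
  // format of the parameter lines is assumed, not shown in the diff
  fprintf(blas_out, "blas_threads[%d][%d] = %d;\n", prec, kernel, threads_max);
  fprintf(blas_out, "blas_blocks[%d][%d] = %d;\n", prec, kernel, blocks_max);
}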