Commit 81df13a7 authored by rbabich

quda-0.2: merged blas improvements from quda r623


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@690 be54200a-260c-0410-bdd7-ce6af2a381ab
parent be8375c9
@@ -4,6 +4,8 @@ Version 0.2.1
   padding, mixed precision, and the "PRESERVE_SOURCE_NO" option were
   all used at the same time.
+- Improved performance of the half and double precision blas routines.
 
 Version 0.2 - 16 December 2009
@@ -41,7 +41,7 @@ clean:
 	$(CXX) $(CXXFLAGS) $< -c -o $@
 
 blas_quda.o: blas_quda.cu blas_param.h $(HDRS)
-	$(NVCC) $(NVCCFLAGS) $< -c -o $@
+	$(NVCC) $(NVCCFLAGS) --maxrregcount=80 $< -c -o $@
 
 %.o: %.cu $(HDRS) $(CORE)
 	$(NVCC) $(NVCCFLAGS) $< -c -o $@
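For context: the new --maxrregcount=80 flag on the blas_quda.o rule caps the number of registers ptxas may assign to each thread in that translation unit. A lower per-thread register count lets more thread blocks stay resident on a multiprocessor at once, at the cost of possible spills to local memory. CUDA also offers a per-kernel analogue, the __launch_bounds__ qualifier; this commit uses only the file-wide flag, but a minimal sketch (with a hypothetical kernel, not taken from the QUDA source) shows the idea:

// Illustrative sketch only; this kernel is not part of the QUDA source.
// __launch_bounds__(128) is the per-kernel analogue of the file-wide
// --maxrregcount flag: the compiler limits register use so the kernel
// can always launch with 128 threads per block, spilling to local
// memory if it must.
__global__ void __launch_bounds__(128)
axpy_sketch(float a, const float *x, float *y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];  // y = a*x + y
}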
@@ -14,17 +14,17 @@ static int blas_threads[22][3] = {
   { 64, 64, 64},    // Kernel 8: caxpbyCuda
   { 64, 64, 64},    // Kernel 9: cxpaypbzCuda
   { 64, 128, 64},   // Kernel 10: axpyZpbxCuda
-  { 64, 64, 64},    // Kernel 11: caxpbypzYmbwCuda
-  { 64, 128, 128},  // Kernel 12: sumCuda
+  { 64, 128, 64},   // Kernel 11: caxpbypzYmbwCuda
+  { 128, 128, 128}, // Kernel 12: sumCuda
   { 64, 128, 128},  // Kernel 13: normCuda
-  { 64, 128, 128},  // Kernel 14: reDotProductCuda
+  { 64, 128, 64},   // Kernel 14: reDotProductCuda
   { 64, 128, 64},   // Kernel 15: axpyNormCuda
-  { 64, 128, 64},   // Kernel 16: xmyNormCuda
+  { 64, 256, 64},   // Kernel 16: xmyNormCuda
   { 64, 128, 64},   // Kernel 17: cDotProductCuda
   { 64, 64, 64},    // Kernel 18: xpaycDotzyCuda
   { 64, 64, 64},    // Kernel 19: cDotProductNormACuda
   { 64, 64, 64},    // Kernel 20: cDotProductNormBCuda
-  { 64, 64, 64}     // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
+  { 128, 64, 64}    // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
 };
 
 static int blas_blocks[22][3] = {
@@ -34,20 +34,20 @@ static int blas_blocks[22][3] = {
   {2048, 128, 4096},  // Kernel 3: axpyCuda
   {2048, 128, 128},   // Kernel 4: xpayCuda
   {2048, 128, 4096},  // Kernel 5: mxpyCuda
-  {2048, 128, 2048},  // Kernel 6: axCuda
-  {2048, 128, 2048},  // Kernel 7: caxpyCuda
-  {2048, 128, 2048},  // Kernel 8: caxpbyCuda
+  {1024, 128, 2048},  // Kernel 6: axCuda
+  {2048, 128, 1024},  // Kernel 7: caxpyCuda
+  {1024, 128, 128},   // Kernel 8: caxpbyCuda
   {1024, 128, 4096},  // Kernel 9: cxpaypbzCuda
-  { 512, 128, 128},   // Kernel 10: axpyZpbxCuda
+  {2048, 2048, 128},  // Kernel 10: axpyZpbxCuda
   {1024, 128, 128},   // Kernel 11: caxpbypzYmbwCuda
-  { 128, 1024, 128},  // Kernel 12: sumCuda
-  { 128, 1024, 128},  // Kernel 13: normCuda
-  { 64, 1024, 128},   // Kernel 14: reDotProductCuda
-  { 256, 1024, 128},  // Kernel 15: axpyNormCuda
+  { 128, 512, 1024},  // Kernel 12: sumCuda
+  { 128, 1024, 1024}, // Kernel 13: normCuda
+  { 512, 1024, 128},  // Kernel 14: reDotProductCuda
+  {2048, 2048, 128},  // Kernel 15: axpyNormCuda
   { 512, 1024, 128},  // Kernel 16: xmyNormCuda
-  { 64, 512, 128},    // Kernel 17: cDotProductCuda
+  { 256, 512, 128},   // Kernel 17: cDotProductCuda
   { 256, 128, 128},   // Kernel 18: xpaycDotzyCuda
-  { 64, 1024, 128},   // Kernel 19: cDotProductNormACuda
+  { 256, 1024, 128},  // Kernel 19: cDotProductNormACuda
   { 64, 1024, 128},   // Kernel 20: cDotProductNormBCuda
-  { 512, 128, 256}    // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
+  { 256, 128, 256}    // Kernel 21: caxpbypzYmbwcDotProductWYNormYQuda
 };
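For readers outside the project: each row of blas_threads and blas_blocks holds the tuned thread-block size and grid size for one of the 22 blas kernels, and the three columns are assumed here to correspond to QUDA's double, single, and half precision code paths (the diff itself does not label them). A hedged sketch of how such tables are typically consumed at launch time; the kernel and wrapper below are illustrative stand-ins, not the QUDA API:

#include <cuda_runtime.h>

// Tuning tables as in blas_param.h above (entries elided here).
static int blas_threads[22][3] = { /* ... */ };
static int blas_blocks[22][3]  = { /* ... */ };

// Illustrative kernel: the grid-stride loop lets a fixed, pre-tuned
// grid size (128, 1024, 2048, ...) cover a vector of any length.
__global__ void axpyKernel(float a, const float *x, float *y, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        y[i] = a * x[i] + y[i];
}

// Hypothetical wrapper: row 3 is labeled axpyCuda in the tables, and
// precision selects the column (assumed 0 = double, 1 = single, 2 = half).
void axpy(float a, float *x, float *y, int n, int precision)
{
    dim3 block(blas_threads[3][precision]);
    dim3 grid(blas_blocks[3][precision]);
    axpyKernel<<<grid, block>>>(a, x, y, n);
}

Per-kernel, per-precision launch geometry of this kind is exactly what the commit retunes above, presumably reflecting the reworked kernels in blas_quda.cu.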