Advanced Computing Platform for Theoretical Physics

Commit 99f30e5e authored by rbabich's avatar rbabich
Browse files

quda: changed indexing of shared memory in Dslash kernel and set

sharedFloats=8


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@602 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 12b32dea
......@@ -4,15 +4,22 @@ Version 0.x
- Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use.
- Added auto-tuning blas to improve performance (see README for details).
- Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi).
- Cleaned up the invert_test example to remove unnecessary dependencies.
- Fixed bug affecting saveGaugeQuda() that caused su3_test to fail.
- Added compiler directive to Makefile to improve performance of the
half-precision clover Dslash on sm_13 hardware.
- Tuned parameters to improve performance of the half-precision clover
Dslash on sm_13 hardware.
- Formally adopted the MIT/X11 license.
Version 0.1 - 17 November 2009
- Initial public release.
......
......@@ -39,7 +39,7 @@
#define checkCudaError() do { \
cudaError_t error = cudaGetLastError(); \
if (error != cudaSuccess) \
errorQuda("CUDA: %s", cudaGetErrorString(error)); \
errorQuda("(CUDA) %s", cudaGetErrorString(error)); \
} while (0)
#ifdef __cplusplus
......
......@@ -38,9 +38,6 @@ gen:
clean:
-rm -f *.o $(QUDA)
dslash_quda.o: dslash_quda.cu $(HDRS) $(CORE)
$(NVCC) $(NVCCFLAGS) -maxrregcount=80 $< -c -o $@
%.o: %.cpp $(HDRS)
$(CXX) $(CXXFLAGS) $< -c -o $@
......
// *** CUDA CLOVER ***
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_FLOATS_PER_THREAD 8
// input spinor
#ifdef SPINOR_DOUBLE
......@@ -234,14 +234,14 @@
#define c32_32_re c12_12_re
// output spinor
volatile spinorFloat o00_re;
volatile spinorFloat o00_im;
volatile spinorFloat o01_re;
volatile spinorFloat o01_im;
volatile spinorFloat o02_re;
volatile spinorFloat o02_im;
volatile spinorFloat o10_re;
volatile spinorFloat o10_im;
#define o00_re s[0*SHARED_STRIDE]
#define o00_im s[1*SHARED_STRIDE]
#define o01_re s[2*SHARED_STRIDE]
#define o01_im s[3*SHARED_STRIDE]
#define o02_re s[4*SHARED_STRIDE]
#define o02_im s[5*SHARED_STRIDE]
#define o10_re s[6*SHARED_STRIDE]
#define o10_im s[7*SHARED_STRIDE]
volatile spinorFloat o11_re;
volatile spinorFloat o11_im;
volatile spinorFloat o12_re;
......@@ -266,6 +266,18 @@ volatile spinorFloat o32_im;
int sid = blockIdx.x*blockDim.x + threadIdx.x;
#ifdef SPINOR_DOUBLE
#define SHARED_STRIDE 8 // to avoid bank conflicts
extern __shared__ spinorFloat sd_data[];
volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts
extern __shared__ spinorFloat ss_data[];
volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#endif
// read spinor from device memory
READ_SPINOR(SPINORTEX);
......@@ -601,6 +613,7 @@ READ_SPINOR(SPINORTEX);
// undefine to prevent warning when precision is changed
#undef spinorFloat
#undef SHARED_STRIDE
#undef i00_re
#undef i00_im
......
// *** CUDA DSLASH ***
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_FLOATS_PER_THREAD 8
// input spinor
#ifdef SPINOR_DOUBLE
#define spinorFloat double
......@@ -303,14 +304,14 @@
#define c32_32_re c12_12_re
// output spinor
volatile spinorFloat o00_re;
volatile spinorFloat o00_im;
volatile spinorFloat o01_re;
volatile spinorFloat o01_im;
volatile spinorFloat o02_re;
volatile spinorFloat o02_im;
volatile spinorFloat o10_re;
volatile spinorFloat o10_im;
#define o00_re s[0*SHARED_STRIDE]
#define o00_im s[1*SHARED_STRIDE]
#define o01_re s[2*SHARED_STRIDE]
#define o01_im s[3*SHARED_STRIDE]
#define o02_re s[4*SHARED_STRIDE]
#define o02_im s[5*SHARED_STRIDE]
#define o10_re s[6*SHARED_STRIDE]
#define o10_im s[7*SHARED_STRIDE]
volatile spinorFloat o11_re;
volatile spinorFloat o11_im;
volatile spinorFloat o12_re;
......@@ -345,6 +346,18 @@ int x1odd = (x2 + x3 + x4 + oddBit) & 1;
int x1 = 2*x1h + x1odd;
int X = 2*sid + x1odd;
#ifdef SPINOR_DOUBLE
#define SHARED_STRIDE 8 // to avoid bank conflicts
extern __shared__ spinorFloat sd_data[];
volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts
extern __shared__ spinorFloat ss_data[];
volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#endif
o00_re = o00_im = 0;
o01_re = o01_im = 0;
o02_re = o02_im = 0;
......@@ -1428,6 +1441,8 @@ o32_re = o32_im = 0;
// undefine to prevent warning when precision is changed
#undef spinorFloat
#undef SHARED_STRIDE
#undef A_re
#undef A_im
......
// *** CUDA DSLASH DAGGER ***
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_FLOATS_PER_THREAD 8
// input spinor
#ifdef SPINOR_DOUBLE
#define spinorFloat double
......@@ -303,14 +304,14 @@
#define c32_32_re c12_12_re
// output spinor
volatile spinorFloat o00_re;
volatile spinorFloat o00_im;
volatile spinorFloat o01_re;
volatile spinorFloat o01_im;
volatile spinorFloat o02_re;
volatile spinorFloat o02_im;
volatile spinorFloat o10_re;
volatile spinorFloat o10_im;
#define o00_re s[0*SHARED_STRIDE]
#define o00_im s[1*SHARED_STRIDE]
#define o01_re s[2*SHARED_STRIDE]
#define o01_im s[3*SHARED_STRIDE]
#define o02_re s[4*SHARED_STRIDE]
#define o02_im s[5*SHARED_STRIDE]
#define o10_re s[6*SHARED_STRIDE]
#define o10_im s[7*SHARED_STRIDE]
volatile spinorFloat o11_re;
volatile spinorFloat o11_im;
volatile spinorFloat o12_re;
......@@ -345,6 +346,18 @@ int x1odd = (x2 + x3 + x4 + oddBit) & 1;
int x1 = 2*x1h + x1odd;
int X = 2*sid + x1odd;
#ifdef SPINOR_DOUBLE
#define SHARED_STRIDE 8 // to avoid bank conflicts
extern __shared__ spinorFloat sd_data[];
volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts
extern __shared__ spinorFloat ss_data[];
volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
+ (threadIdx.x % SHARED_STRIDE);
#endif
o00_re = o00_im = 0;
o01_re = o01_im = 0;
o02_re = o02_im = 0;
......@@ -1428,6 +1441,8 @@ o32_re = o32_im = 0;
// undefine to prevent warning when precision is changed
#undef spinorFloat
#undef SHARED_STRIDE
#undef A_re
#undef A_im
......
......@@ -133,7 +133,7 @@ def a_im(b, s, c): return "a"+`(s+2*b)`+`c`+"_im"
def prolog():
str = []
str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n\n")
# str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
# str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
......@@ -251,11 +251,11 @@ def prolog():
for c in range(0,3):
i = 3*s+c
if 2*i < sharedFloats:
str.append("#define "+out_re(s,c)+" s["+`(2*i+0)`+"]\n")
str.append("#define "+out_re(s,c)+" s["+`(2*i+0)`+"*SHARED_STRIDE]\n")
else:
str.append("volatile spinorFloat "+out_re(s,c)+";\n")
if 2*i+1 < sharedFloats:
str.append("#define "+out_im(s,c)+" s["+`(2*i+1)`+"]\n")
str.append("#define "+out_im(s,c)+" s["+`(2*i+1)`+"*SHARED_STRIDE]\n")
else:
str.append("volatile spinorFloat "+out_im(s,c)+";\n")
str.append("\n")
......@@ -282,11 +282,15 @@ int X = 2*sid + x1odd;
if sharedFloats > 0:
str.append("#ifdef SPINOR_DOUBLE\n")
str.append("#define SHARED_STRIDE 8 // to avoid bank conflicts\n")
str.append("extern __shared__ spinorFloat sd_data[];\n")
str.append("volatile spinorFloat *s = sd_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n")
str.append("volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)\n")
str.append(" + (threadIdx.x % SHARED_STRIDE);\n")
str.append("#else\n")
str.append("#define SHARED_STRIDE 16 // to avoid bank conflicts\n")
str.append("extern __shared__ spinorFloat ss_data[];\n")
str.append("volatile spinorFloat *s = ss_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n")
str.append("volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)\n")
str.append(" + (threadIdx.x % SHARED_STRIDE);\n")
str.append("#endif\n\n")
for s in range(0,4):
......@@ -580,8 +584,8 @@ def epilog():
""")
str.append("// undefine to prevent warning when precision is changed\n")
str.append("#undef spinorFloat\n")
str.append("#undef SHARED_STRIDE\n\n")
str.append("#undef A_re\n")
str.append("#undef A_im\n\n")
......@@ -619,8 +623,8 @@ def epilog():
def generate():
return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + clover() + epilog()
# To fit 192 threads/SM with 16K shared memory, set sharedFloats to 19 or smaller
sharedFloats = 0
# To fit 192 threads/SM (single precision) with 16K shared memory, set sharedFloats to 19 or smaller
sharedFloats = 8
dagger = False
print sys.argv[0] + ": generating dslash_core.h";
......
......@@ -143,7 +143,7 @@ void init() {
gauge = cudaGaugePrecise;
if (clover_yes) {
loadCloverQuda(NULL, hostCloverInv, &inv_param);
loadCloverQuda(hostClover, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment