Advanced Computing Platform for Theoretical Physics

Commit 2f900f63 authored by rbabich's avatar rbabich
Browse files

small changes to quda (double precision clover and general clean-up)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@454 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 8a52d6ab
# Build configuration: source lists, CUDA toolkit location, and the
# compiler/linker flags used by the rules further down.
#
# NOTE(review): INCLUDES, LIB, CFLAGS and CXXFLAGS are each assigned twice in
# this excerpt. They are recursively expanded (`=`) variables, so if both
# copies are really present only the LAST assignment of each is in effect when
# a recipe runs. The earlier copies look like pre-change lines -- confirm
# against the real file.
CUFILES := dslash_cuda.cu blas_cuda.cu
CCFILES := inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp gauge_cuda.cpp spinor_quda.cpp
# Root of the CUDA toolkit; the include path, library path and nvcc location
# below are all derived from it.
CUDA_INSTALL_PATH = /usr/local/cuda
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_INSTALL_PATH)/../cuda_sdk/common/inc
LIB = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart
# This later INCLUDES drops the cuda_sdk/common/inc directory listed above.
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include
# On 64-bit platforms:
LIB = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart # for release 2.3 and newer
# LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart # for release 2.2 and older
# COPT is an extra option passed to the C and C++ compilers only (it appears in
# CFLAGS and CXXFLAGS but not in NVCCFLAGS). It is empty in the active
# configuration; -malign-double is the commented-out 32-bit alternative.
COPT =
# On 32-bit platforms:
# LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart
# COPT = -malign-double
# DFLAGS is empty as written: everything after `#` on the line is a comment.
DFLAGS = #-D__DEVICE_EMULATION__
CC = gcc
CFLAGS = -Wall -O3 -std=c99 $(INCLUDES) ${DFLAGS}
CFLAGS = -Wall -O3 -std=c99 $(COPT) $(INCLUDES) ${DFLAGS}
CXX = g++
CXXFLAGS = -Wall -O3 $(INCLUDES) ${DFLAGS}
CXXFLAGS = -Wall -O3 $(COPT) $(INCLUDES) ${DFLAGS}
NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
# The trailing `#-deviceemu` is a comment, so that switch is not passed to nvcc.
NVCCFLAGS = -O3 $(INCLUDES) ${DFLAGS} -arch=sm_13 #-deviceemu
LDFLAGS = -fPIC $(LIB)
# Object lists derived from the source lists by suffix substitution.
CCOBJECTS = $(CCFILES:.cpp=.o)
CUOBJECTS = $(CUFILES:.cu=.o)
# First rule visible in this excerpt (and therefore the default goal if nothing
# precedes it): build the four test programs.
all: dslash_test invert_test su3_test pack_test
# Static library archive, the objects archived into it, and the full
# prerequisite list (the objects plus the listed headers).
ILIB = libquda.a
ILIB_OBJS = inv_bicgstab_quda.o inv_cg_quda.o dslash_quda.o blas_quda.o util_quda.o \
dslash_reference.o blas_reference.o invert_quda.o gauge_quda.o spinor_quda.o
ILIB_DEPS = $(ILIB_OBJS) blas_quda.h quda.h util_quda.h invert_quda.h gauge_quda.h spinor_quda.h enum_quda.h dslash_reference.h
# NOTE(review): the next two assignments repeat ILIB_OBJS and ILIB_DEPS with the
# same contents as above, only re-wrapped onto shorter lines. The duplicates
# are harmless to make but one copy of each is redundant.
ILIB_OBJS = inv_bicgstab_quda.o inv_cg_quda.o dslash_quda.o blas_quda.o \
util_quda.o dslash_reference.o blas_reference.o invert_quda.o \
gauge_quda.o spinor_quda.o
ILIB_DEPS = $(ILIB_OBJS) blas_quda.h quda.h util_quda.h invert_quda.h \
gauge_quda.h spinor_quda.h enum_quda.h dslash_reference.h
# The archive is rebuilt when any object OR any listed header changes, but only
# the objects are handed to ar.
# NOTE(review): recipe lines in this excerpt are shown without a leading tab;
# GNU make requires one -- confirm the real file has them.
$(ILIB): $(ILIB_DEPS)
ar cru $@ $(ILIB_OBJS)
# Test programs: each links its own object file ($<) against the static
# library.
# NOTE(review): invert_test and dslash_test each appear twice below, once as a
# prerequisite-only line and once directly above a link recipe. As written each
# target still ends up with exactly one recipe, but the two pairs are in
# opposite order and look like before/after copies of the same rules.
invert_test: invert_test.o $(ILIB)
dslash_test: dslash_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
dslash_test: dslash_test.o $(ILIB)
invert_test: invert_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
su3_test: su3_test.o $(ILIB)
......
QUDA v0.x Release Notes
-----------------------
Known issues:
* For compatibility with CUDA, the library is compiled with the GCC option
-malign-double. On 32-bit machines, this differs from the GCC default
and may affect the alignment of various structures, notably those of type
QudaGaugeParam and QudaInvertParam, defined in invert_quda.h. As a result,
it may be necessary to use this switch when compiling code that is to be
linked against the QUDA library.
......@@ -2,7 +2,7 @@
#define SHARED_FLOATS_PER_THREAD 0
// input spinor
#if (DD_SPREC==0)
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
......@@ -55,10 +55,10 @@
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif
#endif // SPINOR_DOUBLE
// gauge link
#if (DD_GPREC==0)
#ifdef GAUGE_DOUBLE
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
......@@ -104,7 +104,7 @@
#define A_re G4.z
#define A_im G4.w
#endif
#endif // GAUGE_DOUBLE
// conjugated gauge link
#define gT00_re (+g00_re)
......@@ -127,6 +127,44 @@
#define gT22_im (-g22_im)
// first chiral block of inverted clover term
#ifdef CLOVER_DOUBLE
// Layout used when CLOVER_DOUBLE is defined: every C register contributes two
// components (.x and .y), and registers C0 through C17 are used.
//
// C0..C2: the six diagonal entries, real part only, packed two per register
// in the index order 00, 01, 02, 10, 11, 12.
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C1.x
#define c10_10_re C1.y
#define c11_11_re C2.x
#define c12_12_re C2.y
// C3..C17: the fifteen off-diagonal entries c<ab>_<cd> whose first index pair
// comes after the second in the order 00, 01, 02, 10, 11, 12. Each entry gets
// its own register: .x holds the real part, .y the imaginary part. Entries are
// grouped by the second index pair (00 first, then 01, 02, 10, 11).
#define c01_00_re C3.x
#define c01_00_im C3.y
#define c02_00_re C4.x
#define c02_00_im C4.y
#define c10_00_re C5.x
#define c10_00_im C5.y
#define c11_00_re C6.x
#define c11_00_im C6.y
#define c12_00_re C7.x
#define c12_00_im C7.y
#define c02_01_re C8.x
#define c02_01_im C8.y
#define c10_01_re C9.x
#define c10_01_im C9.y
#define c11_01_re C10.x
#define c11_01_im C10.y
#define c12_01_re C11.x
#define c12_01_im C11.y
#define c10_02_re C12.x
#define c10_02_im C12.y
#define c11_02_re C13.x
#define c11_02_im C13.y
#define c12_02_re C14.x
#define c12_02_im C14.y
#define c11_10_re C15.x
#define c11_10_im C15.y
#define c12_10_re C16.x
#define c12_10_im C16.y
#define c12_11_re C17.x
#define c12_11_im C17.y
#else
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C0.z
......@@ -163,6 +201,8 @@
#define c12_10_im C8.y
#define c12_11_re C8.z
#define c12_11_im C8.w
#endif // CLOVER_DOUBLE
#define c00_01_re (+c01_00_re)
#define c00_01_im (-c01_00_im)
#define c00_02_re (+c02_00_re)
......@@ -1329,7 +1369,7 @@ o32_re = o32_im = 0;
#ifdef DSLASH_XPAY
READ_ACCUM(ACCUMTEX)
#if (DD_SPREC==0)
#ifdef SPINOR_DOUBLE
o00_re = a*o00_re + accum0.x;
o00_im = a*o00_im + accum0.y;
o01_re = a*o01_re + accum1.x;
......@@ -1379,7 +1419,7 @@ o32_re = o32_im = 0;
o31_im = a*o31_im + accum5.y;
o32_re = a*o32_re + accum5.z;
o32_im = a*o32_im + accum5.w;
#endif // DD_SPREC
#endif // SPINOR_DOUBLE
#endif // DSLASH_XPAY
......@@ -1434,4 +1474,3 @@ o32_re = o32_im = 0;
#undef i31_im
#undef i32_re
#undef i32_im
......@@ -138,7 +138,7 @@ def prolog():
str.append("// input spinor\n")
str.append("#if (DD_SPREC==0)\n")
str.append("#ifdef SPINOR_DOUBLE\n")
str.append("#define spinorFloat double\n")
for s in range(0,4):
for c in range(0,3):
......@@ -153,11 +153,11 @@ def prolog():
i = 3*s+c
str.append("#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n")
str.append("#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n")
str.append("#endif\n\n")
str.append("#endif // SPINOR_DOUBLE\n\n")
str.append("// gauge link\n")
str.append("#if (DD_GPREC==0)\n")
str.append("#ifdef GAUGE_DOUBLE\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
......@@ -179,7 +179,7 @@ def prolog():
str.append("#define A_re G"+nthFloat4(18)+"\n")
str.append("#define A_im G"+nthFloat4(19)+"\n")
str.append("\n")
str.append("#endif\n\n")
str.append("#endif // GAUGE_DOUBLE\n\n")
str.append("// conjugated gauge link\n")
for m in range(0,3):
......@@ -190,6 +190,23 @@ def prolog():
str.append("\n")
str.append("// first chiral block of inverted clover term\n")
str.append("#ifdef CLOVER_DOUBLE\n")
i = 0
for m in range(0,6):
s = m/3
c = m%3
str.append("#define "+c_re(0,s,c,s,c)+" C"+nthFloat2(i)+"\n")
i += 1
for n in range(0,6):
sn = n/3
cn = n%3
for m in range(n+1,6):
sm = m/3
cm = m%3
str.append("#define "+c_re(0,sm,cm,sn,cn)+" C"+nthFloat2(i)+"\n")
str.append("#define "+c_im(0,sm,cm,sn,cn)+" C"+nthFloat2(i+1)+"\n")
i += 2
str.append("#else\n")
i = 0
for m in range(0,6):
s = m/3
......@@ -205,6 +222,8 @@ def prolog():
str.append("#define "+c_re(0,sm,cm,sn,cn)+" C"+nthFloat4(i)+"\n")
str.append("#define "+c_im(0,sm,cm,sn,cn)+" C"+nthFloat4(i+1)+"\n")
i += 2
str.append("#endif // CLOVER_DOUBLE\n\n")
for n in range(0,6):
sn = n/3
cn = n%3
......@@ -261,7 +280,7 @@ int X = 2*sid + x1odd;
""")
if sharedFloats > 0:
str.append("#if (DD_SPREC==0)\n")
str.append("#ifdef SPINOR_DOUBLE\n")
str.append("extern __shared__ spinorFloat sd_data[];\n")
str.append("volatile spinorFloat *s = sd_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n")
str.append("#else\n")
......@@ -529,7 +548,7 @@ def epilog():
READ_ACCUM(ACCUMTEX)
""")
str.append("#if (DD_SPREC==0)\n")
str.append("#ifdef SPINOR_DOUBLE\n")
for s in range(0,4):
for c in range(0,3):
......@@ -545,7 +564,7 @@ def epilog():
str.append(" "+out_re(s,c) +" = a*"+out_re(s,c)+" + accum"+nthFloat4(2*i+0)+";\n")
str.append(" "+out_im(s,c) +" = a*"+out_im(s,c)+" + accum"+nthFloat4(2*i+1)+";\n")
str.append("#endif // DD_SPREC\n")
str.append("#endif // SPINOR_DOUBLE\n")
str.append("#endif // DSLASH_XPAY\n\n")
......@@ -579,12 +598,22 @@ def epilog():
return ''.join(str)
# end def epilog
# Concatenate the complete generated text: prolog, the eight gen(0)..gen(7)
# sections, the clover section, then the epilog.
# NOTE(review): indentation is not preserved in this excerpt; the return
# statement below is the body of generate().
def generate():
return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + clover() + epilog()
# Module-level switches. generate() takes no arguments, so these are presumably
# read as globals by the helper functions -- confirm against the parts of the
# script not shown here.
dagger = False
#dagger = True
# To fit 192 threads/SM with 16K shared memory, set sharedFloats to 19 or smaller
sharedFloats = 0
# NOTE(review): this excerpt shows both a print-to-stdout path (Python 2 print
# statement) and the file-writing path that follows; they look like
# before/after versions of the same step -- confirm which one the real script
# keeps.
print generate()
# Write one header per value of `dagger`:
# False -> dslash_core.h, True -> dslash_dagger_core.h.
dagger = False
f = open('dslash_core.h', 'w')
f.write(generate())
f.close()
dagger = True
f = open('dslash_dagger_core.h', 'w')
f.write(generate())
f.close()
# Disabled: would write a header containing only prolog + clover + epilog.
#f = open('clover_core.h', 'w')
#f.write(prolog() + clover() + epilog())
#f.close()
......@@ -2,7 +2,7 @@
#define SHARED_FLOATS_PER_THREAD 0
// input spinor
#if (DD_SPREC==0)
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
......@@ -55,10 +55,10 @@
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif
#endif // SPINOR_DOUBLE
// gauge link
#if (DD_GPREC==0)
#ifdef GAUGE_DOUBLE
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
......@@ -104,7 +104,7 @@
#define A_re G4.z
#define A_im G4.w
#endif
#endif // GAUGE_DOUBLE
// conjugated gauge link
#define gT00_re (+g00_re)
......@@ -127,6 +127,44 @@
#define gT22_im (-g22_im)
// first chiral block of inverted clover term
#ifdef CLOVER_DOUBLE
// Layout used when CLOVER_DOUBLE is defined: every C register contributes two
// components (.x and .y), and registers C0 through C17 are used.
//
// C0..C2: the six diagonal entries, real part only, packed two per register
// in the index order 00, 01, 02, 10, 11, 12.
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C1.x
#define c10_10_re C1.y
#define c11_11_re C2.x
#define c12_12_re C2.y
// C3..C17: the fifteen off-diagonal entries c<ab>_<cd> whose first index pair
// comes after the second in the order 00, 01, 02, 10, 11, 12. Each entry gets
// its own register: .x holds the real part, .y the imaginary part. Entries are
// grouped by the second index pair (00 first, then 01, 02, 10, 11).
#define c01_00_re C3.x
#define c01_00_im C3.y
#define c02_00_re C4.x
#define c02_00_im C4.y
#define c10_00_re C5.x
#define c10_00_im C5.y
#define c11_00_re C6.x
#define c11_00_im C6.y
#define c12_00_re C7.x
#define c12_00_im C7.y
#define c02_01_re C8.x
#define c02_01_im C8.y
#define c10_01_re C9.x
#define c10_01_im C9.y
#define c11_01_re C10.x
#define c11_01_im C10.y
#define c12_01_re C11.x
#define c12_01_im C11.y
#define c10_02_re C12.x
#define c10_02_im C12.y
#define c11_02_re C13.x
#define c11_02_im C13.y
#define c12_02_re C14.x
#define c12_02_im C14.y
#define c11_10_re C15.x
#define c11_10_im C15.y
#define c12_10_re C16.x
#define c12_10_im C16.y
#define c12_11_re C17.x
#define c12_11_im C17.y
#else
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C0.z
......@@ -163,6 +201,8 @@
#define c12_10_im C8.y
#define c12_11_re C8.z
#define c12_11_im C8.w
#endif // CLOVER_DOUBLE
#define c00_01_re (+c01_00_re)
#define c00_01_im (-c01_00_im)
#define c00_02_re (+c02_00_re)
......@@ -1329,7 +1369,7 @@ o32_re = o32_im = 0;
#ifdef DSLASH_XPAY
READ_ACCUM(ACCUMTEX)
#if (DD_SPREC==0)
#ifdef SPINOR_DOUBLE
o00_re = a*o00_re + accum0.x;
o00_im = a*o00_im + accum0.y;
o01_re = a*o01_re + accum1.x;
......@@ -1379,7 +1419,7 @@ o32_re = o32_im = 0;
o31_im = a*o31_im + accum5.y;
o32_re = a*o32_re + accum5.z;
o32_im = a*o32_im + accum5.w;
#endif // DD_SPREC
#endif // SPINOR_DOUBLE
#endif // DSLASH_XPAY
......@@ -1434,4 +1474,3 @@ o32_re = o32_im = 0;
#undef i31_im
#undef i32_re
#undef i32_im
......@@ -89,6 +89,7 @@
#define DD_GPREC_F D
#define GAUGE0TEX gauge0TexDouble
#define GAUGE1TEX gauge1TexDouble
#define GAUGE_DOUBLE
#elif (DD_GPREC==1) // single-precision gauge field
#define DD_GPREC_F S
#define GAUGE0TEX gauge0TexSingle
......@@ -107,6 +108,7 @@
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX spinorTexDouble
#define WRITE_SPINOR WRITE_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
#if (DD_XPAY==1)
#define ACCUMTEX accumTexDouble
#define READ_ACCUM READ_ACCUM_DOUBLE
......@@ -142,12 +144,13 @@
#define CLOVERTEX cloverTexDouble
#define READ_CLOVER READ_CLOVER_DOUBLE
#define DSLASH_CLOVER
#define CLOVER_DOUBLE
#elif (DD_CPREC==1) // single-precision clover term
#define DD_CPREC_F S
#define CLOVERTEX cloverTexSingle
#define READ_CLOVER READ_CLOVER_SINGLE
#define DSLASH_CLOVER
#elif (DD_CPREC==2) // single-precision clover term
#elif (DD_CPREC==2) // half-precision clover term
#define DD_CPREC_F H
#define CLOVERTEX cloverTexHalf
#define READ_CLOVER READ_CLOVER_HALF
......@@ -202,6 +205,9 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef CLOVERTEX
#undef READ_CLOVER
#undef DSLASH_CLOVER
#undef GAUGE_DOUBLE
#undef SPINOR_DOUBLE
#undef CLOVER_DOUBLE
// prepare next set of options, or clean up after final iteration
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment