Advanced Computing Platform for Theoretical Physics

Commit e5e5e289 authored by mikeaclark's avatar mikeaclark
Browse files

Runtime lattice dimensions and reliable updates for CG

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@416 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 1e57e6e1
......@@ -22,6 +22,20 @@
#define QudaSumFloat3 float3
#endif
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#else
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
// do nothing
return make_double2(0.0, 0.0);
}
#endif
#define RECONSTRUCT_HALF_SPINOR(a, texHalf, texNorm, length) \
float4 a##0 = tex1Dfetch(texHalf, i + 0*length); \
float4 a##1 = tex1Dfetch(texHalf, i + 1*length); \
......@@ -158,7 +172,7 @@
Z.x = z.x; Z.y = z.y; \
z.x = X.z + a.x*Y.z - a.y*Y.w + b.x*Z.z - b.y*Z.w; \
z.y = X.w + a.y*Y.z + a.x*Y.w + b.y*Z.z + b.x*Z.w; \
Z.z = z.x; Z.w = z.y;} \
Z.z = z.x; Z.w = z.y;}
#define CAXPBYPZ_FLOAT4(a, X, b, Y, Z) \
Z.x += a.x*X.x - a.y*X.y + b.x*Y.x - b.y*Y.y; \
......@@ -192,14 +206,6 @@ texture<float, 1, cudaReadModeElementType> texNorm4;
texture<short4, 1, cudaReadModeNormalizedFloat> texHalf5;
texture<float, 1, cudaReadModeElementType> texNorm5;
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#endif
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision != b.precision) {
printf("checkSpinor error, precisions do not match: %d %d\n", a.precision, b.precision);
......
......@@ -299,12 +299,15 @@ volatile spinorFloat o32_im;
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int boundaryCrossings = sid/L1h + sid/(L2*L1h) + sid/(L3*L2*L1h);
int X = 2*sid + (boundaryCrossings + oddBit) % 2;
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
int boundaryCrossings = FAST_INT_DIVIDE(sid,X1h) +
FAST_INT_DIVIDE(sid,X2X1h) + FAST_INT_DIVIDE(sid,X3X2X1h);
int X = 2*sid + ((boundaryCrossings + oddBit)&1);
int x4 = FAST_INT_DIVIDE(X, X3X2X1);
int x3 = FAST_INT_MOD(FAST_INT_DIVIDE(X, X2X1), X3);
int x2 = FAST_INT_DIVIDE(X, X1);
int x1 = X - x2*X1;
x2 = FAST_INT_MOD(x2, X2);
o00_re = o00_im = 0;
o01_re = o01_im = 0;
......@@ -326,7 +329,7 @@ o32_re = o32_im = 0;
// 0 i 1 0
// i 0 0 1
int sp_idx = ((x1==L1-1) ? X-(L1-1) : X+1) / 2;
int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -407,7 +410,7 @@ o32_re = o32_im = 0;
// 0 -i 1 0
// -i 0 0 1
int sp_idx = ((x1==0) ? X+(L1-1) : X-1) / 2;
int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -488,7 +491,7 @@ o32_re = o32_im = 0;
// 0 1 1 0
// -1 0 0 1
int sp_idx = ((x2==L2-1) ? X-(L2-1)*L1 : X+L1) / 2;
int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -569,7 +572,7 @@ o32_re = o32_im = 0;
// 0 -1 1 0
// 1 0 0 1
int sp_idx = ((x2==0) ? X+(L2-1)*L1 : X-L1) / 2;
int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -650,7 +653,7 @@ o32_re = o32_im = 0;
// i 0 1 0
// 0 -i 0 1
int sp_idx = ((x3==L3-1) ? X-(L3-1)*L2*L1 : X+L2*L1) / 2;
int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -731,7 +734,7 @@ o32_re = o32_im = 0;
// -i 0 1 0
// 0 i 0 1
int sp_idx = ((x3==0) ? X+(L3-1)*L2*L1 : X-L2*L1) / 2;
int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -812,10 +815,10 @@ o32_re = o32_im = 0;
// 0 0 2 0
// 0 0 0 2
int sp_idx = ((x4==L4-1) ? X-(L4-1)*L3*L2*L1 : X+L3*L2*L1) / 2;
int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;
int ga_idx = sid;
if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) {
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
......@@ -926,10 +929,10 @@ o32_re = o32_im = 0;
// 0 0 0 0
// 0 0 0 0
int sp_idx = ((x4==0) ? X+(L4-1)*L3*L2*L1 : X-L3*L2*L1) / 2;
int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;
int ga_idx = sp_idx;
if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) {
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
......
......@@ -94,7 +94,6 @@ projectors = [
### code generation ########################################################################
### parameters
sharedFloats = 0
dagger = False
def block(code):
......@@ -110,6 +109,10 @@ def sign(x):
def nthFloat4(n):
return `(n/4)` + "." + ["x", "y", "z", "w"][n%4]
def nthFloat2(n):
return `(n/2)` + "." + ["x", "y"][n%2]
def in_re(s, c): return "i"+`s`+`c`+"_re"
def in_im(s, c): return "i"+`s`+`c`+"_im"
def g_re(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_re"
......@@ -130,24 +133,54 @@ def prolog():
str = []
str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
str.append("#define SHARED_BYTES (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
str.append("#define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double))\n\n")
str.append("#define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
str.append("// input spinor\n")
str.append("#if (DD_SPREC==0)\n")
str.append("#define spinorFloat double\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append("#define "+in_re(s,c)+" I"+nthFloat2(2*i+0)+"\n")
str.append("#define "+in_im(s,c)+" I"+nthFloat2(2*i+1)+"\n")
str.append("\n")
str.append("#else\n")
str.append("#define spinorFloat float\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append("#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n")
str.append("#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n")
str.append("\n")
str.append("#endif\n\n")
str.append("// gauge link\n")
str.append("#if (DD_GPREC==0)\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
str.append("#define "+g_re(0,m,n)+" G"+nthFloat2(2*i+0)+"\n")
str.append("#define "+g_im(0,m,n)+" G"+nthFloat2(2*i+1)+"\n")
str.append("// temporaries\n")
str.append("#define A_re G"+nthFloat2(18)+"\n")
str.append("#define A_im G"+nthFloat2(19)+"\n")
str.append("\n")
str.append("#else\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
str.append("#define "+g_re(0,m,n)+" G"+nthFloat4(2*i+0)+"\n")
str.append("#define "+g_im(0,m,n)+" G"+nthFloat4(2*i+1)+"\n")
str.append("\n")
str.append("// temporaries\n")
str.append("#define A_re G"+nthFloat4(18)+"\n")
str.append("#define A_im G"+nthFloat4(19)+"\n")
str.append("\n")
str.append("#endif\n\n")
str.append("// conjugated gauge link\n")
for m in range(0,3):
for n in range(0,3):
......@@ -156,11 +189,6 @@ def prolog():
str.append("#define "+g_im(1,m,n)+" (-"+g_im(0,n,m)+")\n")
str.append("\n")
str.append("// temporaries\n")
str.append("#define A_re G"+nthFloat4(18)+"\n")
str.append("#define A_im G"+nthFloat4(19)+"\n")
str.append("\n")
str.append("// first chiral block of inverted clover term\n")
i = 0
for m in range(0,6):
......@@ -205,11 +233,11 @@ def prolog():
if 2*i < sharedFloats:
str.append("#define "+out_re(s,c)+" s["+`(2*i+0)`+"]\n")
else:
str.append("volatile float "+out_re(s,c)+";\n")
str.append("volatile spinorFloat "+out_re(s,c)+";\n")
if 2*i+1 < sharedFloats:
str.append("#define "+out_im(s,c)+" s["+`(2*i+1)`+"]\n")
else:
str.append("volatile float "+out_im(s,c)+";\n")
str.append("volatile spinorFloat "+out_im(s,c)+";\n")
str.append("\n")
str.append(
......@@ -220,18 +248,26 @@ def prolog():
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int boundaryCrossings = sid/L1h + sid/(L2*L1h) + sid/(L3*L2*L1h);
int X = 2*sid + (boundaryCrossings + oddBit) % 2;
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
int boundaryCrossings = FAST_INT_DIVIDE(sid,X1h) +
FAST_INT_DIVIDE(sid,X2X1h) + FAST_INT_DIVIDE(sid,X3X2X1h);
int X = 2*sid + ((boundaryCrossings + oddBit)&1);
int x4 = FAST_INT_DIVIDE(X, X3X2X1);
int x3 = FAST_INT_MOD(FAST_INT_DIVIDE(X, X2X1), X3);
int x2 = FAST_INT_DIVIDE(X, X1);
int x1 = X - x2*X1;
x2 = FAST_INT_MOD(x2, X2);
""")
if sharedFloats > 0:
str.append("extern __shared__ float s_data[];\n")
str.append("volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n\n")
str.append("#if (DD_SPREC==0)\n")
str.append("extern __shared__ spinorFloat sd_data[];\n")
str.append("volatile spinorFloat *s = sd_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n")
str.append("#else\n")
str.append("extern __shared__ spinorFloat ss_data[];\n")
str.append("volatile spinorFloat *s = ss_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n")
str.append("#endif\n\n")
for s in range(0,4):
for c in range(0,3):
......@@ -266,14 +302,14 @@ def gen(dir):
str.append("// "+l+"\n")
str.append("\n")
if dir == 0: str.append("int sp_idx = ((x1==L1-1) ? X-(L1-1) : X+1) / 2;\n")
if dir == 1: str.append("int sp_idx = ((x1==0) ? X+(L1-1) : X-1) / 2;\n")
if dir == 2: str.append("int sp_idx = ((x2==L2-1) ? X-(L2-1)*L1 : X+L1) / 2;\n")
if dir == 3: str.append("int sp_idx = ((x2==0) ? X+(L2-1)*L1 : X-L1) / 2;\n")
if dir == 4: str.append("int sp_idx = ((x3==L3-1) ? X-(L3-1)*L2*L1 : X+L2*L1) / 2;\n")
if dir == 5: str.append("int sp_idx = ((x3==0) ? X+(L3-1)*L2*L1 : X-L2*L1) / 2;\n")
if dir == 6: str.append("int sp_idx = ((x4==L4-1) ? X-(L4-1)*L3*L2*L1 : X+L3*L2*L1) / 2;\n")
if dir == 7: str.append("int sp_idx = ((x4==0) ? X+(L4-1)*L3*L2*L1 : X-L3*L2*L1) / 2;\n")
if dir == 0: str.append("int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;\n")
if dir == 1: str.append("int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;\n")
if dir == 2: str.append("int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;\n")
if dir == 3: str.append("int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;\n")
if dir == 4: str.append("int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;\n")
if dir == 5: str.append("int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;\n")
if dir == 6: str.append("int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;\n")
if dir == 7: str.append("int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;\n")
ga_idx = "sid" if dir % 2 == 0 else "sp_idx"
str.append("int ga_idx = "+ga_idx+";\n\n")
......@@ -334,24 +370,24 @@ def gen(dir):
strRe.append(sign(-im)+in_im(s,c))
strIm.append(sign(im)+in_re(s,c))
project.append("float "+h1_re(h,c)+ " = "+''.join(strRe)+";\n")
project.append("float "+h1_im(h,c)+ " = "+''.join(strIm)+";\n")
project.append("spinorFloat "+h1_re(h,c)+ " = "+''.join(strRe)+";\n")
project.append("spinorFloat "+h1_im(h,c)+ " = "+''.join(strIm)+";\n")
project.append("\n")
ident = []
ident.append("// identity gauge matrix\n")
for m in range(0,3):
for h in range(0,2):
ident.append("float "+h2_re(h,m)+" = " + h1_re(h,m) + "; ")
ident.append("float "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n")
ident.append("spinorFloat "+h2_re(h,m)+" = " + h1_re(h,m) + "; ")
ident.append("spinorFloat "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n")
ident.append("\n")
mult = []
for m in range(0,3):
mult.append("// multiply row "+`m`+"\n")
for h in range(0,2):
re = ["float "+h2_re(h,m)+" ="]
im = ["float "+h2_im(h,m)+" ="]
re = ["spinorFloat "+h2_re(h,m)+" ="]
im = ["spinorFloat "+h2_im(h,m)+" ="]
for c in range(0,3):
re.append(" + ("+g_re(dir,m,c)+" * "+h1_re(h,c)+" - "+g_im(dir,m,c)+" * "+h1_im(h,c)+")")
im.append(" + ("+g_re(dir,m,c)+" * "+h1_im(h,c)+" + "+g_im(dir,m,c)+" * "+h1_re(h,c)+")")
......@@ -385,7 +421,7 @@ def gen(dir):
reconstruct.append("\n")
if dir >= 6:
str.append("if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) ")
str.append("if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) ")
str.append(block(''.join(load_spinor) + ''.join(project) + ''.join(ident) + ''.join(reconstruct)))
str.append("else ")
str.append(block(''.join(load_gauge) + ''.join(load_spinor) + ''.join(reconstruct_gauge) +
......@@ -400,14 +436,14 @@ def gen(dir):
def toChiralBasis(c):
str = []
str.append("float "+a_re(0,0,c)+" = -"+out_re(1,c)+" - "+out_re(3,c)+";\n")
str.append("float "+a_im(0,0,c)+" = -"+out_im(1,c)+" - "+out_im(3,c)+";\n")
str.append("float "+a_re(0,1,c)+" = "+out_re(0,c)+" + "+out_re(2,c)+";\n")
str.append("float "+a_im(0,1,c)+" = "+out_im(0,c)+" + "+out_im(2,c)+";\n")
str.append("float "+a_re(0,2,c)+" = -"+out_re(1,c)+" + "+out_re(3,c)+";\n")
str.append("float "+a_im(0,2,c)+" = -"+out_im(1,c)+" + "+out_im(3,c)+";\n")
str.append("float "+a_re(0,3,c)+" = "+out_re(0,c)+" - "+out_re(2,c)+";\n")
str.append("float "+a_im(0,3,c)+" = "+out_im(0,c)+" - "+out_im(2,c)+";\n")
str.append("spinorFloat "+a_re(0,0,c)+" = -"+out_re(1,c)+" - "+out_re(3,c)+";\n")
str.append("spinorFloat "+a_im(0,0,c)+" = -"+out_im(1,c)+" - "+out_im(3,c)+";\n")
str.append("spinorFloat "+a_re(0,1,c)+" = "+out_re(0,c)+" + "+out_re(2,c)+";\n")
str.append("spinorFloat "+a_im(0,1,c)+" = "+out_im(0,c)+" + "+out_im(2,c)+";\n")
str.append("spinorFloat "+a_re(0,2,c)+" = -"+out_re(1,c)+" + "+out_re(3,c)+";\n")
str.append("spinorFloat "+a_im(0,2,c)+" = -"+out_im(1,c)+" + "+out_im(3,c)+";\n")
str.append("spinorFloat "+a_re(0,3,c)+" = "+out_re(0,c)+" - "+out_re(2,c)+";\n")
str.append("spinorFloat "+a_im(0,3,c)+" = "+out_im(0,c)+" - "+out_im(2,c)+";\n")
str.append("\n")
for s in range (0,4):
......@@ -419,14 +455,14 @@ def toChiralBasis(c):
def fromChiralBasis(c): # note: factor of 1/2 is included in clover term normalization
str = []
str.append("float "+a_re(0,0,c)+" = "+out_re(1,c)+" + "+out_re(3,c)+";\n")
str.append("float "+a_im(0,0,c)+" = "+out_im(1,c)+" + "+out_im(3,c)+";\n")
str.append("float "+a_re(0,1,c)+" = -"+out_re(0,c)+" - "+out_re(2,c)+";\n")
str.append("float "+a_im(0,1,c)+" = -"+out_im(0,c)+" - "+out_im(2,c)+";\n")
str.append("float "+a_re(0,2,c)+" = "+out_re(1,c)+" - "+out_re(3,c)+";\n")
str.append("float "+a_im(0,2,c)+" = "+out_im(1,c)+" - "+out_im(3,c)+";\n")
str.append("float "+a_re(0,3,c)+" = -"+out_re(0,c)+" + "+out_re(2,c)+";\n")
str.append("float "+a_im(0,3,c)+" = -"+out_im(0,c)+" + "+out_im(2,c)+";\n")
str.append("spinorFloat "+a_re(0,0,c)+" = "+out_re(1,c)+" + "+out_re(3,c)+";\n")
str.append("spinorFloat "+a_im(0,0,c)+" = "+out_im(1,c)+" + "+out_im(3,c)+";\n")
str.append("spinorFloat "+a_re(0,1,c)+" = -"+out_re(0,c)+" - "+out_re(2,c)+";\n")
str.append("spinorFloat "+a_im(0,1,c)+" = -"+out_im(0,c)+" - "+out_im(2,c)+";\n")
str.append("spinorFloat "+a_re(0,2,c)+" = "+out_re(1,c)+" - "+out_re(3,c)+";\n")
str.append("spinorFloat "+a_im(0,2,c)+" = "+out_im(1,c)+" - "+out_im(3,c)+";\n")
str.append("spinorFloat "+a_re(0,3,c)+" = -"+out_re(0,c)+" + "+out_re(2,c)+";\n")
str.append("spinorFloat "+a_im(0,3,c)+" = -"+out_im(0,c)+" + "+out_im(2,c)+";\n")
str.append("\n")
for s in range (0,4):
......@@ -443,7 +479,7 @@ def cloverMult(chi):
for s in range (0,2):
for c in range (0,3):
str.append("float "+a_re(chi,s,c)+" = 0; float "+a_im(chi,s,c)+" = 0;\n")
str.append("spinorFloat "+a_re(chi,s,c)+" = 0; spinorFloat "+a_im(chi,s,c)+" = 0;\n")
str.append("\n")
for sm in range (0,2):
......@@ -492,11 +528,25 @@ def epilog():
#ifdef DSLASH_XPAY
READ_ACCUM(ACCUMTEX)
""")
str.append("#if (DD_SPREC==0)\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append(" "+out_re(s,c) +" = a*"+out_re(s,c)+" + accum"+nthFloat2(2*i+0)+";\n")
str.append(" "+out_im(s,c) +" = a*"+out_im(s,c)+" + accum"+nthFloat2(2*i+1)+";\n")
str.append("#else\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append(" "+out_re(s,c) +" = a*"+out_re(s,c)+" + accum"+nthFloat4(2*i+0)+";\n")
str.append(" "+out_im(s,c) +" = a*"+out_im(s,c)+" + accum"+nthFloat4(2*i+1)+";\n")
str.append("#endif // DD_SPREC\n")
str.append("#endif // DSLASH_XPAY\n\n")
str.append(
......@@ -505,6 +555,27 @@ def epilog():
WRITE_SPINOR();
""")
str.append("// undefine to prevent warning when precision is changed\n")
str.append("#undef spinorFloat\n")
str.append("#undef A_re\n")
str.append("#undef A_im\n\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
str.append("#undef "+g_re(0,m,n)+"\n")
str.append("#undef "+g_im(0,m,n)+"\n")
str.append("\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append("#undef "+in_re(s,c)+"\n")
str.append("#undef "+in_im(s,c)+"\n")
return ''.join(str)
# end def epilog
......@@ -513,7 +584,7 @@ def generate():
return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + clover() + epilog()
dagger = False
sharedFloats = 19
#dagger = True
dagger = True
sharedFloats = 0
print generate()
......@@ -299,12 +299,15 @@ volatile spinorFloat o32_im;
#include "io_spinor.h"
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int boundaryCrossings = sid/L1h + sid/(L2*L1h) + sid/(L3*L2*L1h);
int X = 2*sid + (boundaryCrossings + oddBit) % 2;
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
int boundaryCrossings = FAST_INT_DIVIDE(sid,X1h) +
FAST_INT_DIVIDE(sid,X2X1h) + FAST_INT_DIVIDE(sid,X3X2X1h);
int X = 2*sid + ((boundaryCrossings + oddBit)&1);
int x4 = FAST_INT_DIVIDE(X, X3X2X1);
int x3 = FAST_INT_MOD(FAST_INT_DIVIDE(X, X2X1), X3);
int x2 = FAST_INT_DIVIDE(X, X1);
int x1 = X - x2*X1;
x2 = FAST_INT_MOD(x2, X2);
o00_re = o00_im = 0;
o01_re = o01_im = 0;
......@@ -326,7 +329,7 @@ o32_re = o32_im = 0;
// 0 -i 1 0
// -i 0 0 1
int sp_idx = ((x1==L1-1) ? X-(L1-1) : X+1) / 2;
int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -407,7 +410,7 @@ o32_re = o32_im = 0;
// 0 i 1 0
// i 0 0 1
int sp_idx = ((x1==0) ? X+(L1-1) : X-1) / 2;
int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -488,7 +491,7 @@ o32_re = o32_im = 0;
// 0 -1 1 0
// 1 0 0 1
int sp_idx = ((x2==L2-1) ? X-(L2-1)*L1 : X+L1) / 2;
int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -569,7 +572,7 @@ o32_re = o32_im = 0;
// 0 1 1 0
// -1 0 0 1
int sp_idx = ((x2==0) ? X+(L2-1)*L1 : X-L1) / 2;
int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -650,7 +653,7 @@ o32_re = o32_im = 0;
// -i 0 1 0
// 0 i 0 1
int sp_idx = ((x3==L3-1) ? X-(L3-1)*L2*L1 : X+L2*L1) / 2;
int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -731,7 +734,7 @@ o32_re = o32_im = 0;
// i 0 1 0
// 0 -i 0 1
int sp_idx = ((x3==0) ? X+(L3-1)*L2*L1 : X-L2*L1) / 2;
int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -812,10 +815,10 @@ o32_re = o32_im = 0;
// 0 0 0 0
// 0 0 0 0
int sp_idx = ((x4==L4-1) ? X-(L4-1)*L3*L2*L1 : X+L3*L2*L1) / 2;
int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;
int ga_idx = sid;
if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) {
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
......@@ -926,10 +929,10 @@ o32_re = o32_im = 0;
// 0 0 2 0
// 0 0 0 2
int sp_idx = ((x4==0) ? X+(L4-1)*L3*L2*L1 : X-L3*L2*L1) / 2;
int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;
int ga_idx = sp_idx;
if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) {
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
......
......@@ -21,12 +21,11 @@
//
// DD_GPREC_F = D, S, H
// DD_SPREC_F = D, S, H
// DD_CPREC_F = S, [blank]; the latter corresponds to plain Wilson
// DD_CPREC_F = D, S, H, [blank]; the latter corresponds to plain Wilson
// DD_RECON_F = 12, 8
// DD_DAG_F = Dagger, [blank]
// DD_XPAY_F = Xpay, [blank]
// initialize on first iteration
#ifndef DD_LOOP
......@@ -36,7 +35,7 @@
#define DD_RECON 0
#define DD_GPREC 0
#define DD_SPREC 0
#define DD_CPREC 0
#define DD_CPREC 1
#endif
// set options for current iteration
......@@ -138,16 +137,26 @@
#endif
#endif
#if (DD_CPREC==0) // single-precision clover term
#if (DD_CPREC==0) // double-precision clover term
#define DD_CPREC_F D
#define CLOVERTEX cloverTexDouble
#define READ_CLOVER READ_CLOVER_DOUBLE
#define DSLASH_CLOVER
#elif (DD_CPREC==1) // single-precision clover term
#define DD_CPREC_F S
#define CLOVERTEX cloverTexSingle
#define READ_CLOVER READ_CLOVER_SINGLE
#define DSLASH_CLOVER
#elif (DD_CPREC==2) // single-precision clover term
#define DD_CPREC_F H
#define CLOVERTEX cloverTexHalf
#define READ_CLOVER READ_CLOVER_HALF
#define DSLASH_CLOVER
#else // no clover term
#define DD_CPREC_F
#endif
#if !(__CUDA_ARCH__ != 130 && (DD_SPREC == 0 || DD_GPREC == 0))
#if !(__CUDA_ARCH__ != 130 && (DD_SPREC == 0 || DD_GPREC == 0 || DD_CPREC == 0))
#define DD_CONCAT(g,s,c,r,d,x) dslash ## g ## s ## c ## r ## d ## x ## Kernel
#define DD_FUNC(g,s,c,r,d,x) DD_CONCAT(g,s,c,r,d,x)
......@@ -237,9 +246,15 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_SPREC
#define DD_SPREC 0
#if (DD_CPREC==0)
//#if (DD_CPREC==0)
//#undef DD_CPREC
//#define DD_CPREC 1
#if (DD_CPREC==1)
#undef DD_CPREC
#define DD_CPREC 1
//#define DD_CPREC 2
//#elif (DD_CPREC==2)
//#undef DD_CPREC
#define DD_CPREC 3
#else
#undef DD_LOOP
......
......@@ -3,14 +3,11 @@
#include <dslash_quda.h>
// ----------------------------------------------------------------------
// Cuda code
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#endif
......@@ -46,9 +43,16 @@ texture<float4, 1, cudaReadModeElementType> accumTexSingle;
texture<short4, 1, cudaReadModeNormalizedFloat> accumTexHalf;
texture<float, 1, cudaReadModeElementType> accumTexNorm;
// Double precision clover term
texture<int4, 1> cloverTexDouble;
// Single precision clover term
texture<float4, 1, cudaReadModeElementType> cloverTexSingle;
// Half precision clover term
texture<short4, 1, cudaReadModeNormalizedFloat> cloverTexHalf;
texture<float, 1, cudaReadModeElementType> cloverTexNorm;
QudaGaugeParam *gauge_param;
QudaInvertParam *invert_param;
......@@ -56,7 +60,16 @@ __constant__ int X1;
__constant__ int X2;
__constant__ int X3;
__constant__ int X4;
__constant__ int X1h;
__constant__ int X2X1h;
__constant__ int X3X2X1h;
__constant__ int X4X3X2X1h;
__constant__ int X2X1;
__constant__ int X3X2X1;
__constant__ int X4X3X2X1;
__constant__ int Vh;
__constant__ int gauge_fixed;
......@@ -71,7 +84,43 @@ __constant__ double t_boundary;
#include <dslash_def.h>
void setCudaGaugeParam() {
void initDslashCuda(FullGauge gauge) {
int Vh = gauge.volume;
cudaMemcpyToSymbol("Vh", &Vh, sizeof(int));
int X1 = 2*gauge.X[0];
cudaMemcpyToSymbol("X1", &X1, sizeof(int));
int X2 = gauge.X[1];
cudaMemcpyToSymbol("X2", &X2, sizeof(int));
int X3 = gauge.X[2];
cudaMemcpyToSymbol("X3", &X3, sizeof(int));
int X4 = gauge.X[3];
cudaMemcpyToSymbol("X4", &X4, sizeof(int));
int X2X1 = X2*X1;
cudaMemcpyToSymbol("X2X1", &X2X1, sizeof(int));
int X3X2X1 = X3*X2*X1;
cudaMemcpyToSymbol("X3X2X1", &X3X2X1, sizeof(int));
int X4X3X2X1 = X4*X3*X2*X1;
cudaMemcpyToSymbol("X4X3X2X1", &X4X3X2X1, sizeof(int));
int X1h = X1/2;
cudaMemcpyToSymbol("X1h", &X1h, sizeof(int));
int X2X1h = X2*X1h;
cudaMemcpyToSymbol("X2X1h", &X2X1h, sizeof(int));
int X3X2X1h = X3*X2*X1h;
cudaMemcpyToSymbol("X3X2X1h", &X3X2X1h, sizeof(int));
int X4X3X2X1h = X4*X3*X2*X1h;
cudaMemcpyToSymbol("X4X3X2X1h", &X4X3X2X1h, sizeof(int));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
......@@ -91,35 +140,29 @@ void setCudaGaugeParam() {
}
void bindGaugeTex(FullGauge gauge, int oddBit) {
int reconstruct = (gauge.reconstruct == QUDA_RECONSTRUCT_12) ? 12 : 8;
int packed_gauge_bytes = 4*Nh*reconstruct;
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
packed_gauge_bytes *= sizeof(double);
if (oddBit) {
cudaBindTexture(0, gauge0TexDouble, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexDouble, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes);
cudaBindTexture(0, gauge1TexDouble, gauge.even, gauge.bytes);
} else {
cudaBindTexture(0, gauge0TexDouble, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexDouble, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexDouble, gauge.even, gauge.bytes);
cudaBindTexture(0, gauge1TexDouble, gauge.odd, gauge.bytes);
}
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
packed_gauge_bytes *= sizeof(float);
if (oddBit) {
cudaBindTexture(0, gauge0TexSingle, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexSingle, gauge.odd, gauge.bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.even, gauge.bytes);
} else {
cudaBindTexture(0, gauge0TexSingle, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexSingle, gauge.even, gauge.bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.odd, gauge.bytes);
}
} else {
packed_gauge_bytes *= sizeof(float)/2;
if (oddBit) {
cudaBindTexture(0, gauge0TexHalf, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexHalf, gauge.odd, gauge.bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.even, gauge.bytes);
} else {
cudaBindTexture(0, gauge0TexHalf, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge0TexHalf, gauge.even, gauge.bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.odd, gauge.bytes);
}
}
}
......@@ -132,12 +175,33 @@ void checkSpinor(ParitySpinor out, ParitySpinor in) {
printf("Error in dslash quda: input and out spinor precision's don't match\n");
exit(-1);
}
#if (__CUDA_ARCH__ != 130)
if (in.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
}
#endif
}
void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
if (spinor.volume != gauge.volume) {
printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume);
exit(-1);
}
#if (__CUDA_ARCH__ != 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
}
#endif
}
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
initDslashCuda(gauge);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashDCuda(out, gauge, in, parity, dagger);
......@@ -152,14 +216,15 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(double);
int spinor_bytes = res.length*sizeof(double);
cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes);
#if (__CUDA_ARCH__ == 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -203,6 +268,7 @@ void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
}
}
#endif
}
......@@ -210,15 +276,16 @@ void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(float);
int spinor_bytes = res.length*sizeof(float);
cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
#if (__CUDA_ARCH__ == 130)
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDS12Kernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit);
......@@ -232,6 +299,7 @@ void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
dslashDS8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit);
}
}
#endif
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -268,16 +336,17 @@ void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
int spinor_bytes = res.length*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
#if (__CUDA_ARCH__ == 130)
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDH12Kernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit);
......@@ -291,6 +360,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
dslashDH8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit);
}
}
#endif
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -325,7 +395,9 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
initDslashCuda(gauge);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out, gauge, in, parity, dagger, x, a);
......@@ -340,15 +412,16 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(double);
int spinor_bytes = res.length*sizeof(double);
cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexDouble, x.spinor, spinor_bytes);
#if (__CUDA_ARCH__ == 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -395,22 +468,24 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
}
}
#endif
}
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(float);
int spinor_bytes = res.length*sizeof(float);
cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexSingle, x.spinor, spinor_bytes);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
#if (__CUDA_ARCH__ == 130)
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
......@@ -425,6 +500,7 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
dslashDS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
}
}
#endif
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -462,18 +538,19 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
int spinor_bytes = res.length*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12);
cudaBindTexture(0, accumTexHalf, x.spinor, spinor_bytes);
cudaBindTexture(0, accumTexNorm, x.spinorNorm, spinor_bytes/12);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
#if (__CUDA_ARCH__ == 130)
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
......@@ -491,6 +568,7 @@ void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
#endif
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
......@@ -539,37 +617,14 @@ int dslashCudaSharedBytes() {
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type, int dagger) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa;
if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, dagger);
dslashXpayDCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashDCuda(tmp, gauge, in, 0, dagger);
dslashXpayDCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
} else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, dagger);
dslashXpaySCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashSCuda(tmp, gauge, in, 0, dagger);
dslashXpaySCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
} else if (in.precision == QUDA_HALF_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, dagger);
dslashXpayHCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, dagger);
dslashXpayHCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, dagger);
dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
}
void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
......@@ -580,53 +635,6 @@ void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
// Apply the full operator
// Apply the full (unpreconditioned) operator to a full even/odd spinor,
// dispatching to the precision-specific dslash-xpay kernels. The odd
// output parity is built from the even input and vice versa, with the
// accumulate term scaled by -kappa.
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int dagger) {
  checkSpinor(in.even, out.even);

  switch (in.even.precision) {
  case QUDA_DOUBLE_PRECISION:
    dslashXpayDCuda(out.odd,  gauge, in.even, 1, dagger, in.odd,  -kappa);
    dslashXpayDCuda(out.even, gauge, in.odd,  0, dagger, in.even, -kappa);
    break;
  case QUDA_SINGLE_PRECISION:
    dslashXpaySCuda(out.odd,  gauge, in.even, 1, dagger, in.odd,  -kappa);
    dslashXpaySCuda(out.even, gauge, in.odd,  0, dagger, in.even, -kappa);
    break;
  case QUDA_HALF_PRECISION:
    dslashXpayHCuda(out.odd,  gauge, in.even, 1, dagger, in.odd,  -kappa);
    dslashXpayHCuda(out.even, gauge, in.odd,  0, dagger, in.even, -kappa);
    break;
  default:
    // No other precisions are handled (matches the original if/else chain,
    // which fell through silently for any other value).
    break;
  }
}
/*
// Apply the even-odd preconditioned Dirac operator
QudaSumComplex MatPCcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 0);
dslashSCuda(out, gauge, tmp, 0, 0);
} else {
dslashSCuda(tmp, gauge, in, 0, 0);
dslashSCuda(out, gauge, tmp, 1, 0);
}
// out = in - kappa2*out, dot = (d, out)
return xpaycDotzyCuda((float2*)in.spinor, kappa2, (float2*)out.spinor, (float2*)d.spinor, Nh*spinorSiteSize/2);
dslashXpayCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
dslashXpayCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
}
// Apply the even-odd preconditioned Dirac operator
QudaSumComplex MatPCDagcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 1);
dslashSCuda(out, gauge, tmp, 0, 1);
} else {
dslashSCuda(tmp, gauge, in, 0, 1);
dslashSCuda(out, gauge, tmp, 1, 1);
}
// out = in - kappa2*out, dot = (d, out)
return xpaycDotzyCuda((float2*)in.spinor, kappa2, (float2*)out.spinor, (float2*)d.spinor, Nh*spinorSiteSize/2);
}*/
......@@ -10,12 +10,6 @@
#define cloverSiteSize 72 // real numbers per block-diagonal clover matrix
#define BLOCK_DIM (64) // threads per block
#define GRID_DIM (Nh/BLOCK_DIM) // there are Nh threads in total
#define PACKED12_GAUGE_BYTES (4*Nh*12*sizeof(float))
#define PACKED8_GAUGE_BYTES (4*Nh*8*sizeof(float))
#define CLOVER_BYTES (Nh*cloverSiteSize*sizeof(float))
#ifdef __cplusplus
extern "C" {
......@@ -32,7 +26,7 @@ extern "C" {
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes();
void setCudaGaugeParam();
void initDslashCuda();
void bindGaugeTex(FullGauge gauge, int oddBit);
// Double precision routines
......@@ -67,18 +61,13 @@ extern "C" {
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type);
/*QudaSumComplex MatPCcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
QudaSumComplex MatPCDagcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);*/
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *param);
FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type);
#ifdef __cplusplus
......
......@@ -6,6 +6,19 @@
#include <util_quda.h>
#include <dslash_reference.h>
int Z[4];
int V;
int Vh;

// Record the runtime lattice dimensions in the global Z array and derive
// the full-lattice site count V and the half (single-parity) count Vh.
void setDims(int *X) {
  int vol = 1;
  for (int d = 0; d < 4; d++) {
    Z[d] = X[d];
    vol *= X[d];
  }
  V = vol;
  Vh = vol / 2;
}
template <typename Float>
void sum(Float *dst, Float *a, Float *b, int cnt) {
for (int i = 0; i < cnt; i++)
......@@ -28,21 +41,22 @@ void xpay(Float *x, Float a, Float *y, int len) {
// new "half index", which can be an index into either the even or odd lattices.
// displacements of magnitude one always interchange odd and even lattices.
//
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
int X = fullLatticeIndex(i, oddBit);
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
int Y = fullLatticeIndex(i, oddBit);
int x4 = Y/(Z[2]*Z[1]*Z[0]);
int x3 = (Y/(Z[1]*Z[0])) % Z[2];
int x2 = (Y/Z[0]) % Z[1];
int x1 = Y % Z[0];
// assert (oddBit == (x+y+z+t)%2);
x4 = (x4+dx4+L4) % L4;
x3 = (x3+dx3+L3) % L3;
x2 = (x2+dx2+L2) % L2;
x1 = (x1+dx1+L1) % L1;
x4 = (x4+dx4+Z[3]) % Z[3];
x3 = (x3+dx3+Z[2]) % Z[2];
x2 = (x2+dx2+Z[1]) % Z[1];
x1 = (x1+dx1+Z[0]) % Z[0];
return (x4*(L3*L2*L1) + x3*(L2*L1) + x2*(L1) + x1) / 2;
return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
}
template <typename Float>
......@@ -176,7 +190,7 @@ const double projector[8][4][4][2] = {
// todo pass projector
template <typename Float>
void multiplySpinorByDiracProjector(Float *res, int projIdx, Float *spinorIn) {
for (uint i=0; i<4*3*2; i++) res[i] = 0.0;
for (int i=0; i<4*3*2; i++) res[i] = 0.0;
for (int s = 0; s < 4; s++) {
for (int t = 0; t < 4; t++) {
......@@ -205,15 +219,15 @@ void multiplySpinorByDiracProjector(Float *res, int projIdx, Float *spinorIn) {
//
template <typename sFloat, typename gFloat>
void dslashReference(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, int oddBit, int daggerBit) {
for (uint i=0; i<Nh*4*3*2; i++) res[i] = 0.0;
for (int i=0; i<Vh*4*3*2; i++) res[i] = 0.0;
gFloat *gaugeEven[4], *gaugeOdd[4];
for (int dir = 0; dir < 4; dir++) {
gaugeEven[dir] = gaugeFull[dir];
gaugeOdd[dir] = gaugeFull[dir]+Nh*gaugeSiteSize;
gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
}
for (int i = 0; i < Nh; i++) {
for (int i = 0; i < Vh; i++) {
for (int dir = 0; dir < 8; dir++) {
gFloat *gauge = gaugeLink(i, dir, oddBit, gaugeEven, gaugeOdd);
sFloat *spinor = spinorNeighbor(i, dir, oddBit, spinorField);
......@@ -234,8 +248,8 @@ void dslashReference(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, int o
}
}
void dslash_reference(void *res, void **gaugeFull, void *spinorField, int oddBit, int daggerBit,
Precision sPrecision, Precision gPrecision) {
void dslash(void *res, void **gaugeFull, void *spinorField, int oddBit, int daggerBit,
Precision sPrecision, Precision gPrecision) {
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION)
......@@ -251,135 +265,68 @@ void dslash_reference(void *res, void **gaugeFull, void *spinorField, int oddBit
}
template <typename sFloat, typename gFloat>
void Mat(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa) {
void Mat(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa, int daggerBit) {
sFloat *inEven = in;
sFloat *inOdd = in + Nh*spinorSiteSize;
sFloat *inOdd = in + Vh*spinorSiteSize;
sFloat *outEven = out;
sFloat *outOdd = out + Nh*spinorSiteSize;
sFloat *outOdd = out + Vh*spinorSiteSize;
// full dslash operator
dslashReference(outOdd, gauge, inEven, 1, 0);
dslashReference(outEven, gauge, inOdd, 0, 0);
dslashReference(outOdd, gauge, inEven, 1, daggerBit);
dslashReference(outEven, gauge, inOdd, 0, daggerBit);
// lastly apply the kappa term
xpay(in, -kappa, out, N*spinorSiteSize);
xpay(in, -kappa, out, V*spinorSiteSize);
}
// Daggered counterpart of Mat: applies the full dslash to both parities
// with daggerBit = 1, then folds in the kappa term so that
// out = in - kappa * (dslash result).
template <typename sFloat, typename gFloat>
void MatDag(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa) {
  // Even fields sit at the start of the buffer, odd fields Nh sites later.
  sFloat *evenIn  = in;
  sFloat *oddIn   = in  + Nh*spinorSiteSize;
  sFloat *evenOut = out;
  sFloat *oddOut  = out + Nh*spinorSiteSize;

  // Daggered hopping term: each output parity reads the opposite input parity.
  dslashReference(oddOut,  gauge, evenIn, 1, 1);
  dslashReference(evenOut, gauge, oddIn,  0, 1);

  // Diagonal term over the full lattice: out = in + (-kappa)*out.
  xpay(in, -kappa, out, N*spinorSiteSize);
}
void mat(void *out, void **gauge, void *in, double kappa, int dagger_bit,
void mat(void *out, void **gauge, void *in, double kappa, int dagger_bit,
Precision sPrecision, Precision gPrecision) {
if (!dagger_bit) {
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION) Mat((double*)out, (double**)gauge, (double*)in, (double)kappa);
else Mat((double*)out, (float**)gauge, (double*)in, (double)kappa);
else
if (gPrecision == QUDA_DOUBLE_PRECISION) Mat((float*)out, (double**)gauge, (float*)in, (float)kappa);
else Mat((float*)out, (float**)gauge, (float*)in, (float)kappa);
} else {
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION) MatDag((double*)out, (double**)gauge, (double*)in, (double)kappa);
else MatDag((float*)out, (double**)gauge, (float*)in, (float)kappa);
else
if (gPrecision == QUDA_DOUBLE_PRECISION) MatDag((float*)out, (double**)gauge, (float*)in, (float)kappa);
else MatDag((float*)out, (float**)gauge, (float*)in, (float)kappa);
}
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION)
Mat((double*)out, (double**)gauge, (double*)in, (double)kappa, dagger_bit);
else
Mat((double*)out, (float**)gauge, (double*)in, (double)kappa, dagger_bit);
else
if (gPrecision == QUDA_DOUBLE_PRECISION)
Mat((float*)out, (double**)gauge, (float*)in, (float)kappa, dagger_bit);
else
Mat((float*)out, (float**)gauge, (float*)in, (float)kappa, dagger_bit);
}
// Apply the even-odd preconditioned Dirac operator
template <typename sFloat, typename gFloat>
void MatPC(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
MatPCType matpc_type) {
void MatPC(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
int daggerBit, MatPCType matpc_type) {
sFloat *tmp = (sFloat*)malloc(Nh*spinorSiteSize*sizeof(sFloat));
sFloat *tmp = (sFloat*)malloc(Vh*spinorSiteSize*sizeof(sFloat));
// full dslash operator
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashReference(tmp, gauge, inEven, 1, 0);
dslashReference(outEven, gauge, tmp, 0, 0);
dslashReference(tmp, gauge, inEven, 1, daggerBit);
dslashReference(outEven, gauge, tmp, 0, daggerBit);
} else {
dslashReference(tmp, gauge, inEven, 0, 0);
dslashReference(outEven, gauge, tmp, 1, 0);
dslashReference(tmp, gauge, inEven, 0, daggerBit);
dslashReference(outEven, gauge, tmp, 1, daggerBit);
}
// lastly apply the kappa term
sFloat kappa2 = -kappa*kappa;
xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
free(tmp);
}
// Apply the even-odd preconditioned Dirac operator
template <typename sFloat, typename gFloat>
void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
MatPCType matpc_type) {
sFloat *tmp = (sFloat*)malloc(Nh*spinorSiteSize*sizeof(sFloat));
// full dslash operator
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashReference(tmp, gauge, inEven, 1, 1);
dslashReference(outEven, gauge, tmp, 0, 1);
} else {
dslashReference(tmp, gauge, inEven, 0, 1);
dslashReference(outEven, gauge, tmp, 1, 1);
}
sFloat kappa2 = -kappa*kappa;
xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
xpay(inEven, kappa2, outEven, Vh*spinorSiteSize);
free(tmp);
}
void matpc(void *outEven, void **gauge, void *inEven, double kappa,
MatPCType matpc_type, int dagger_bit, Precision sPrecision, Precision gPrecision) {
if (!dagger_bit) {
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPC((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, matpc_type);
else
MatPC((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type);
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPC((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, dagger_bit, matpc_type);
else
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPC((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type);
else
MatPC((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type);
} else {
if (sPrecision == QUDA_DOUBLE_PRECISION)
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPCDag((double*)outEven, (double**)gauge, (double*)inEven, kappa, matpc_type);
else
MatPCDag((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type);
MatPC((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, dagger_bit, matpc_type);
else
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPC((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, dagger_bit, matpc_type);
else
if (gPrecision == QUDA_DOUBLE_PRECISION)
MatPCDag((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type);
else
MatPCDag((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type);
}
MatPC((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, dagger_bit, matpc_type);
}
/*void MatDagMat(float *out, float **gauge, float *in, float kappa) {
float *tmp = (float*)malloc(N*spinorSiteSize*sizeof(float));
Mat(tmp, gauge, in, kappa);
MatDag(out, gauge, tmp, kappa);
free(tmp);
}*/
/*void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa,
MatPCType matpc_type) {
float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
MatPC(tmp, gauge, in, kappa, matpc_type);
MatPCDag(out, gauge, tmp, kappa, matpc_type);
free(tmp);
}*/
......@@ -7,12 +7,17 @@
extern "C" {
#endif
// -- dslash_reference.cpp
void dslash_reference(void *res, void **gauge, void *spinorField,
int oddBit, int daggerBit, Precision sPrecision, Precision gPrecision);
extern int Z[4];
extern int Vh;
extern int V;
void setDims(int *);
void dslash(void *res, void **gauge, void *spinorField,
int oddBit, int daggerBit, Precision sPrecision, Precision gPrecision);
void mat(void *out, void **gauge, void *in, double kappa, int daggerBit, Precision sPrecision, Precision gPrecision);
void mat(void *out, void **gauge, void *in, double kappa, int daggerBit,
Precision sPrecision, Precision gPrecision);
void matpc(void *out, void **gauge, void *in, double kappa, MatPCType matpc_type,
int daggerBit, Precision sPrecision, Precision gPrecision);
......
......@@ -19,25 +19,29 @@ FullSpinor cudaSpinorOut;
ParitySpinor tmp;
void *hostGauge[4];
void *spinor, *spinorRef, *spinorGPU;
void *spinorEven, *spinorOdd;
void *spinor, *spinorEven, *spinorOdd;
void *spinorRef, *spinorRefEven, *spinorRefOdd;
void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
double kappa = 1.0;
int ODD_BIT = 0;
int ODD_BIT = 1;
int DAGGER_BIT = 0;
int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
gaugeParam.X[0] = 8;
gaugeParam.X[1] = 8;
gaugeParam.X[2] = 8;
gaugeParam.X[3] = 8;
setDims(gaugeParam.X);
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
gaugeParam.T = L4;
gaugeParam.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
......@@ -55,18 +59,26 @@ void init() {
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
// construct input fields
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gSize);
spinor = malloc(N*spinorSiteSize*sSize);
spinorRef = malloc(N*spinorSiteSize*sSize);
spinorGPU = malloc(N*spinorSiteSize*sSize);
spinor = malloc(V*spinorSiteSize*sSize);
spinorRef = malloc(V*spinorSiteSize*sSize);
spinorGPU = malloc(V*spinorSiteSize*sSize);
spinorEven = spinor;
if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)
spinorOdd = (void*)((double*)spinor + Nh*spinorSiteSize);
else
spinorOdd = (void*)((float*)spinor + Nh*spinorSiteSize);
spinorRefEven = spinorRef;
spinorGPUEven = spinorGPU;
if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) {
spinorOdd = (void*)((double*)spinor + Vh*spinorSiteSize);
spinorRefOdd = (void*)((double*)spinorRef + Vh*spinorSiteSize);
spinorGPUOdd = (void*)((double*)spinorGPU + Vh*spinorSiteSize);
} else {
spinorOdd = (void*)((float*)spinor + Vh*spinorSiteSize);
spinorRefOdd = (void*)((float*)spinorRef + Vh*spinorSiteSize);
spinorGPUOdd = (void*)((float*)spinorGPU + Vh*spinorSiteSize);
}
printf("Randomizing fields...");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
......@@ -82,9 +94,11 @@ void init() {
if (!TRANSFER) {
tmp = allocateParitySpinor(Nh, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(N, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(N, inv_param.cuda_prec);
gaugeParam.X[0] /= 2;
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
gaugeParam.X[0] *= 2;
if (test_type < 2) {
loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec,
......@@ -93,8 +107,8 @@ void init() {
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.dirac_order);
}
}
}
}
......@@ -154,8 +168,8 @@ void dslashRef() {
fflush(stdout);
switch (test_type) {
case 0:
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
dslash(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
......@@ -178,7 +192,7 @@ void dslashTest() {
init();
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
......@@ -191,23 +205,33 @@ void dslashTest() {
double secs = dslashCUDA();
if (!TRANSFER) {
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
if (test_type < 2)
retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
else
retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
printf("GFLOPS = %f\n", 1.0e-9*flops*Nh/secs);
printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
/* for (int is=0; is<Vh; is++) {
printf("%e %e\n", ((double*)spinorRef)[is*24], ((double*)spinorOdd)[is*24]);
}
exit(0);*/
int res;
if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Nh*4*3*2, 1e-4, inv_param.cpu_prec);
else res = compare_floats(spinorGPU, spinorRef, N*4*3*2, 1e-4, inv_param.cpu_prec);
if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Vh*4*3*2, 1e-4, inv_param.cpu_prec);
else res = compare_floats(spinorGPU, spinorRef, V*4*3*2, 1e-4, inv_param.cpu_prec);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
if (test_type < 2) strong_check(spinorRef, spinorOdd, Nh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, Nh, inv_param.cpu_prec);
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
exit(0);
}
......
......@@ -6,8 +6,6 @@
#include <xmmintrin.h>
#define __DEVICE_EMULATION__
#define SHORT_LENGTH 65536
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
......@@ -18,88 +16,88 @@ inline short FloatToShort(Float a) {
}
template <typename Float>
inline void pack8(double2 *res, Float *g, int dir) {
double2 *r = res + dir*2*Nh;
inline void pack8(double2 *res, Float *g, int dir, int V) {
double2 *r = res + dir*2*V;
r[0].x = atan2(g[1], g[0]);
r[0].y = atan2(g[13], g[12]);
r[Nh].x = g[2];
r[Nh].y = g[3];
r[2*Nh].x = g[4];
r[2*Nh].y = g[5];
r[3*Nh].x = g[6];
r[3*Nh].y = g[7];
r[V].x = g[2];
r[V].y = g[3];
r[2*V].x = g[4];
r[2*V].y = g[5];
r[3*V].x = g[6];
r[3*V].y = g[7];
}
template <typename Float>
inline void pack8(float4 *res, Float *g, int dir) {
float4 *r = res + dir*4*Nh;
inline void pack8(float4 *res, Float *g, int dir, int V) {
float4 *r = res + dir*4*V;
r[0].x = atan2(g[1], g[0]);
r[0].y = atan2(g[13], g[12]);
r[0].z = g[2];
r[0].w = g[3];
r[Nh].x = g[4];
r[Nh].y = g[5];
r[Nh].z = g[6];
r[Nh].w = g[7];
r[V].x = g[4];
r[V].y = g[5];
r[V].z = g[6];
r[V].w = g[7];
}
template <typename Float>
inline void pack8(short4 *res, Float *g, int dir) {
short4 *r = res + dir*4*Nh;
inline void pack8(short4 *res, Float *g, int dir, int V) {
short4 *r = res + dir*4*V;
r[0].x = FloatToShort(atan2(g[1], g[0])/ M_PI);
r[0].y = FloatToShort(atan2(g[13], g[12])/ M_PI);
r[0].z = FloatToShort(g[2]);
r[0].w = FloatToShort(g[3]);
r[Nh].x = FloatToShort(g[4]);
r[Nh].y = FloatToShort(g[5]);
r[Nh].z = FloatToShort(g[6]);
r[Nh].w = FloatToShort(g[7]);
r[V].x = FloatToShort(g[4]);
r[V].y = FloatToShort(g[5]);
r[V].z = FloatToShort(g[6]);
r[V].w = FloatToShort(g[7]);
}
template <typename Float>
inline void pack12(double2 *res, Float *g, int dir) {
double2 *r = res + dir*6*Nh;
inline void pack12(double2 *res, Float *g, int dir, int V) {
double2 *r = res + dir*6*V;
for (int j=0; j<6; j++) {
r[j*Nh].x = g[j*2+0];
r[j*Nh].y = g[j*2+1];
r[j*V].x = g[j*2+0];
r[j*V].y = g[j*2+1];
}
}
template <typename Float>
inline void pack12(float4 *res, Float *g, int dir) {
float4 *r = res + dir*3*Nh;
inline void pack12(float4 *res, Float *g, int dir, int V) {
float4 *r = res + dir*3*V;
for (int j=0; j<3; j++) {
r[j*Nh].x = (float)g[j*4+0];
r[j*Nh].y = (float)g[j*4+1];
r[j*Nh].z = (float)g[j*4+2];
r[j*Nh].w = (float)g[j*4+3];
r[j*V].x = (float)g[j*4+0];
r[j*V].y = (float)g[j*4+1];
r[j*V].z = (float)g[j*4+2];
r[j*V].w = (float)g[j*4+3];
}
}
template <typename Float>
inline void pack12(short4 *res, Float *g, int dir) {
short4 *r = res + dir*3*Nh;
inline void pack12(short4 *res, Float *g, int dir, int V) {
short4 *r = res + dir*3*V;
for (int j=0; j<3; j++) {
r[j*Nh].x = FloatToShort(g[j*4+0]);
r[j*Nh].y = FloatToShort(g[j*4+1]);
r[j*Nh].z = FloatToShort(g[j*4+2]);
r[j*Nh].w = FloatToShort(g[j*4+3]);
r[j*V].x = FloatToShort(g[j*4+0]);
r[j*V].y = FloatToShort(g[j*4+1]);
r[j*V].z = FloatToShort(g[j*4+2]);
r[j*V].w = FloatToShort(g[j*4+3]);
}
}
// Assume the gauge field is "QDP" ordered directions inside of
// space-time column-row ordering even-odd space-time
template <typename Float, typename FloatN>
void packQDPGaugeField(FloatN *res, Float **gauge, int oddBit, ReconstructType reconstruct) {
void packQDPGaugeField(FloatN *res, Float **gauge, int oddBit, ReconstructType reconstruct, int V) {
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
Float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
for (int i = 0; i < Nh; i++) pack12(res+i, g+i*18, dir);
Float *g = gauge[dir] + oddBit*V*18;
for (int i = 0; i < V; i++) pack12(res+i, g+i*18, dir, V);
}
} else {
for (int dir = 0; dir < 4; dir++) {
Float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
for (int i = 0; i < Nh; i++) pack8(res+i, g+i*18, dir);
Float *g = gauge[dir] + oddBit*V*18;
for (int i = 0; i < V; i++) pack8(res+i, g+i*18, dir, V);
}
}
}
......@@ -107,26 +105,26 @@ void packQDPGaugeField(FloatN *res, Float **gauge, int oddBit, ReconstructType r
// Assume the gauge field is "Wilson" ordered directions inside of
// space-time column-row ordering even-odd space-time
template <typename Float, typename FloatN>
void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType reconstruct) {
void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType reconstruct, int V) {
Float gT[18];
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
Float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
for (int i = 0; i < Nh; i++) {
Float *g = gauge + (oddBit*V*4+dir)*18;
for (int i = 0; i < V; i++) {
// Must reorder rows-columns
for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
pack12(res+i, gT, dir);
pack12(res+i, gT, dir, V);
}
}
} else {
for (int dir = 0; dir < 4; dir++) {
Float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
for (int i = 0; i < Nh; i++) {
Float *g = gauge + (oddBit*V*4+dir)*18;
for (int i = 0; i < V; i++) {
// Must reorder rows-columns
for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
pack8(res+i, gT, dir);
pack8(res+i, gT, dir, V);
}
}
}
......@@ -138,23 +136,25 @@ void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, Preci
cudaGauge->reconstruct = reconstruct;
cudaGauge->precision = precision;
cudaGauge->Nc = 3;
int floatSize;
if (precision == QUDA_DOUBLE_PRECISION) floatSize = sizeof(double);
else if (precision == QUDA_SINGLE_PRECISION) floatSize = sizeof(float);
else floatSize = sizeof(float)/2;
int elements = (reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
cudaGauge->packedGaugeBytes = 4*Nh*elements*floatSize;
cudaGauge->bytes = 4*cudaGauge->volume*elements*floatSize;
if (!cudaGauge->even) {
if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even gauge field\n");
exit(0);
}
}
if (!cudaGauge->odd) {
if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even odd gauge field\n");
exit(0);
}
......@@ -170,33 +170,34 @@ void freeGaugeField(FullGauge *cudaGauge) {
}
template <typename Float, typename FloatN>
void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType reconstruct, int packedGaugeBytes) {
void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType reconstruct,
int bytes, int Vh) {
// Use pinned memory
FloatN *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
cudaMallocHost((void**)&packedEven, packedGaugeBytes);
cudaMallocHost((void**)&packedOdd, packedGaugeBytes);
cudaMallocHost((void**)&packedEven, bytes);
cudaMallocHost((void**)&packedOdd, bytes);
#else
packedEven = (FloatN*)malloc(packedGaugeBytes);
packedOdd = (FloatN*)malloc(packedGaugeBytes);
packedEven = (FloatN*)malloc(bytes);
packedOdd = (FloatN*)malloc(bytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct);
packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct);
packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, Vh);
packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct, Vh);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct);
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(even, packedEven, packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(odd, packedOdd, packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
......@@ -208,7 +209,8 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, Precision precision) {
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct,
Precision precision, int *X) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
......@@ -225,24 +227,32 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
exit(-1);
}
cudaGauge->volume = 1;
for (int d=0; d<4; d++) {
cudaGauge->X[d] = X[d];
cudaGauge->volume *= X[d];
}
cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
cudaGauge->volume /= 2;
allocateGaugeField(cudaGauge, reconstruct, precision);
if (precision == QUDA_DOUBLE_PRECISION) {
loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
} else if (precision == QUDA_SINGLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
} else if (precision == QUDA_HALF_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
}
}
......@@ -8,7 +8,8 @@
extern "C" {
#endif
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, Precision precision);
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge,
ReconstructType reconstruct, Precision precision, int *X);
void freeGaugeField(FullGauge *cudaCauge);
#ifdef __cplusplus
......
......@@ -8,17 +8,17 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise,
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor r = allocateParitySpinor(x.X, x.precision);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor b = allocateParitySpinor(x.X, x.precision);
ParitySpinor x_sloppy, r_sloppy, tmp_sloppy, src_sloppy;
if (invert_param->cuda_prec_sloppy == x.precision) {
......@@ -27,10 +27,10 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
tmp_sloppy = tmp;
src_sloppy = src;
} else {
x_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
copyCuda(src_sloppy, src);
}
......@@ -151,13 +151,14 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
invert_param->secs += stopwatchReadSeconds();
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = (1.0e-9*Nh)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
gflops += 1.0e-9*Nh*rUpdate*((2*1320+48) + 3*spinorSiteSize);
gflops += 1.0e-9*Nh*xUpdate*spinorSiteSize;
float gflops = (1.0e-9*x.volume)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
gflops += 1.0e-9*x.volume*rUpdate*((2*1320+48) + 3*spinorSiteSize);
gflops += 1.0e-9*x.volume*xUpdate*spinorSiteSize;
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
invert_param->gflops += gflops;
invert_param->iter += k;
......
......@@ -7,12 +7,38 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *perf)
void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *perf)
{
ParitySpinor r;
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor Ap = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor r = allocateParitySpinor(x.X, x.precision);
ParitySpinor b;
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b = allocateParitySpinor(x.X, x.precision);
copyCuda(b, source);
} else {
b = source;
}
ParitySpinor x_sloppy, r_sloppy, tmp_sloppy;
if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
tmp_sloppy = tmp;
} else {
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
}
copyCuda(r, b);
if (r_sloppy.precision != r.precision) copyCuda(r_sloppy, r);
copyCuda(p, r_sloppy);
zeroCuda(x_sloppy);
zeroCuda(y);
double b2 = 0.0;
b2 = normCuda(b);
......@@ -24,40 +50,80 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
double alpha, beta;
double pAp;
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
copyCuda(r, b);
} else {
r = b;
}
copyCuda(p, r);
zeroCuda(x);
double rNorm = sqrt(r2);
double r0Norm = rNorm;
double maxrx = rNorm;
double maxrr = rNorm;
double delta = invert_param->reliable_delta;
int k=0;
printf("%d iterations, r2 = %e\n", k, r2);
int xUpdate = 0, rUpdate = 0;
//printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
MatPCDagMatPCCuda(Ap, gaugeSloppy, p, perf->kappa, tmp_sloppy, perf->matpc_type);
pAp = reDotProductCuda(p, Ap);
alpha = r2 / pAp;
r2_old = r2;
r2 = axpyNormCuda(-alpha, Ap, r);
beta = r2 / r2_old;
axpyZpbxCuda(alpha, p, x, r, beta);
r2 = axpyNormCuda(-alpha, Ap, r_sloppy);
// reliable update conditions
rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm;
if (rNorm > maxrr) maxrr = rNorm;
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (!updateR) {
beta = r2 / r2_old;
axpyZpbxCuda(alpha, p, x_sloppy, r_sloppy, beta);
} else {
axpyCuda(alpha, p, x_sloppy);
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
MatPCDagMatPCCuda(r, gaugePrecise, x, invert_param->kappa,
tmp, invert_param->matpc_type);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
rNorm = sqrt(r2);
maxrr = rNorm;
rUpdate++;
if (updateX) {
xpyCuda(x, y);
zeroCuda(x_sloppy);
copyCuda(b, r);
r0Norm = rNorm;
maxrx = rNorm;
xUpdate++;
}
beta = r2 / r2_old;
xpayCuda(r_sloppy, beta, p);
}
k++;
printf("%d iterations, r2 = %e\n", k, r2);
//printf("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x);
perf->secs = stopwatchReadSeconds();
//if (k==maxiters)
//printf("Exceeded maximum iterations %d\n", maxiters);
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
//printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = k*(1.0e-9*Nh)*(2*(2*1320+48) + 10*spinorSiteSize);
float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize);
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
perf->gflops = gflops;
perf->iter = k;
......@@ -73,9 +139,19 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
k, r2, true_res / b2);
#endif
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeParitySpinor(r);
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(tmp_sloppy);
freeParitySpinor(r_sloppy);
freeParitySpinor(x_sloppy);
}
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeParitySpinor(b);
freeParitySpinor(r);
freeParitySpinor(p);
freeParitySpinor(Ap);
freeParitySpinor(b);
freeParitySpinor(y);
return;
}
......@@ -17,10 +17,9 @@ FullGauge cudaGaugeSloppy; // sloppy gauge field
void printGaugeParam(QudaGaugeParam *param) {
printf("Gauge Params:\n");
printf("X = %d\n", param->X);
printf("Y = %d\n", param->Y);
printf("Z = %d\n", param->Z);
printf("T = %d\n", param->T);
for (int d=0; d<4; d++) {
printf("X[%d] = %d\n", d, param->X[d]);
}
printf("anisotropy = %e\n", param->anisotropy);
printf("gauge_order = %d\n", param->gauge_order);
printf("cpu_prec = %d\n", param->cpu_prec);
......@@ -94,20 +93,21 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
gauge_param = param;
setCudaGaugeParam();
if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
/* if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
printf("QUDA error: dimensions do not match: %d=%d, %d=%d, %d=%d, %d=%d\n",
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1);
}
}*/
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, gauge_param->cuda_prec);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct,
gauge_param->cuda_prec, gauge_param->X);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy,
gauge_param->cuda_prec_sloppy, gauge_param->X);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
......@@ -121,10 +121,19 @@ void endQuda()
freeGaugeField(&cudaGaugeSloppy);
}
// Validate the host-side precision requested in the inverter parameters.
// QUDA provides no CPU code path for half precision, so a half-precision
// cpu_prec is a fatal configuration error: print a diagnostic and exit.
void checkPrecision(QudaInvertParam *param) {
  if (param->cpu_prec != QUDA_HALF_PRECISION) return; // supported precision: nothing to do
  printf("Half precision not supported on cpu\n");
  exit(-1);
}
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger)
{
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
checkPrecision(inv_param);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
......@@ -136,9 +145,11 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
{
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
checkPrecision(inv_param);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger);
......@@ -151,9 +162,11 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
checkPrecision(inv_param);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
......@@ -165,8 +178,10 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
}
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
checkPrecision(inv_param);
FullSpinor in = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -183,12 +198,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
{
invert_param = param;
if (param->cpu_prec == QUDA_HALF_PRECISION) {
printf("Half precision not supported on cpu\n");
exit(-1);
}
checkPrecision(param);
int slenh = Nh*spinorSiteSize;
int slenh = cudaGaugePrecise.volume*spinorSiteSize;
param->spinorGiB = (double)slenh*(param->cuda_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double): sizeof(float);
if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO)
param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(1<<30);
......@@ -202,13 +214,13 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
double kappa = param->kappa;
FullSpinor b, x;
ParitySpinor in = allocateParitySpinor(Nh, invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(Nh, invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(Nh, invert_param->cuda_prec); // temporary used when applying operator
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // temporary used when applying operator
if (param->solution_type == QUDA_MAT_SOLUTION) {
if (param->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b = allocateSpinorField(N, invert_param->cuda_prec);
b = allocateSpinorField(cudaGaugePrecise.X, invert_param->cuda_prec);
} else {
b.even = out;
b.odd = tmp;
......@@ -249,14 +261,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
copyCuda(out, in);
MatPCCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type, QUDA_DAG_YES);
}
invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
invertCgCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param);
break;
case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_YES);
invertBiCGstabCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param, QUDA_DAG_YES);
copyCuda(in, out);
}
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_NO);
invertBiCGstabCuda(out, in, cudaGaugePrecise, cudaGaugeSloppy, tmp, param, QUDA_DAG_NO);
break;
default:
printf("Inverter type %d not implemented\n", param->inv_type);
......
......@@ -9,10 +9,7 @@ extern "C" {
typedef struct QudaGaugeParam_s {
int X;
int Y;
int Z;
int T;
int X[4];
double anisotropy;
......
......@@ -15,22 +15,25 @@ int main(int argc, char **argv)
QudaGaugeParam Gauge_param;
QudaInvertParam inv_param;
Gauge_param.X[0] = 24;
Gauge_param.X[1] = 24;
Gauge_param.X[2] = 24;
Gauge_param.X[3] = 32;
setDims(Gauge_param.X);
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.X = L1;
Gauge_param.Y = L2;
Gauge_param.Z = L3;
Gauge_param.T = L4;
Gauge_param.anisotropy = 1.0;
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
inv_param.inv_type = QUDA_CG_INVERTER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
......@@ -44,23 +47,23 @@ int main(int argc, char **argv)
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; // preservation doesn't work with reliable?
inv_param.dirac_order = QUDA_DIRAC_ORDER;
size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
for (int dir = 0; dir < 4; dir++) {
gauge[dir] = malloc(N*gaugeSiteSize*gSize);
gauge[dir] = malloc(V*gaugeSiteSize*gSize);
}
construct_gauge_field(gauge, 1, Gauge_param.cpu_prec);
void *spinorIn = malloc(N*spinorSiteSize*sSize);
void *spinorOut = malloc(N*spinorSiteSize*sSize);
void *spinorCheck = malloc(N*spinorSiteSize*sSize);
void *spinorIn = malloc(V*spinorSiteSize*sSize);
void *spinorOut = malloc(V*spinorSiteSize*sSize);
void *spinorCheck = malloc(V*spinorSiteSize*sSize);
int i0 = 0;
int s0 = 0;
......@@ -84,11 +87,11 @@ int main(int argc, char **argv)
mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, Gauge_param.cpu_prec);
if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION)
ax(0.5/inv_param.kappa, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
mxpy(spinorIn, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
double nrm2 = norm_2(spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
double src2 = norm_2(spinorIn, N*spinorSiteSize, inv_param.cpu_prec);
mxpy(spinorIn, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec);
printf("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
endQuda();
......
#define READ_SPINOR_DOUBLE(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*Nh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Nh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Nh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Nh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Nh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Nh); \
double2 I6 = fetch_double2((spinor), sp_idx + 6*Nh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Nh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Nh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Nh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Nh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Nh);
double2 I0 = fetch_double2((spinor), sp_idx + 0*Vh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Vh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Vh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Vh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Vh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Vh); \
double2 I6 = fetch_double2((spinor), sp_idx + 6*Vh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Vh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Vh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Vh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Vh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Vh);
#define READ_SPINOR_DOUBLE_UP(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*Nh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Nh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Nh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Nh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Nh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Nh);
double2 I0 = fetch_double2((spinor), sp_idx + 0*Vh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Vh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Vh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Vh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Vh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Vh);
#define READ_SPINOR_DOUBLE_DOWN(spinor) \
double2 I6 = fetch_double2((spinor), sp_idx + 6*Nh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Nh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Nh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Nh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Nh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Nh);
double2 I6 = fetch_double2((spinor), sp_idx + 6*Vh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Vh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Vh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Vh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Vh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Vh);
#define READ_SPINOR_SINGLE(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh);
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh);
#define READ_SPINOR_SINGLE_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
#define READ_SPINOR_SINGLE_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh);
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh);
#define READ_SPINOR_HALF(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
......@@ -62,52 +62,52 @@
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_SPINOR_HALF_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
#define READ_SPINOR_HALF_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_ACCUM_DOUBLE(spinor) \
double2 accum0 = fetch_double2((spinor), sid + 0*Nh); \
double2 accum1 = fetch_double2((spinor), sid + 1*Nh); \
double2 accum2 = fetch_double2((spinor), sid + 2*Nh); \
double2 accum3 = fetch_double2((spinor), sid + 3*Nh); \
double2 accum4 = fetch_double2((spinor), sid + 4*Nh); \
double2 accum5 = fetch_double2((spinor), sid + 5*Nh); \
double2 accum6 = fetch_double2((spinor), sid + 6*Nh); \
double2 accum7 = fetch_double2((spinor), sid + 7*Nh); \
double2 accum8 = fetch_double2((spinor), sid + 8*Nh); \
double2 accum9 = fetch_double2((spinor), sid + 9*Nh); \
double2 accum10 = fetch_double2((spinor), sid + 10*Nh); \
double2 accum11 = fetch_double2((spinor), sid + 11*Nh);
double2 accum0 = fetch_double2((spinor), sid + 0*Vh); \
double2 accum1 = fetch_double2((spinor), sid + 1*Vh); \
double2 accum2 = fetch_double2((spinor), sid + 2*Vh); \
double2 accum3 = fetch_double2((spinor), sid + 3*Vh); \
double2 accum4 = fetch_double2((spinor), sid + 4*Vh); \
double2 accum5 = fetch_double2((spinor), sid + 5*Vh); \
double2 accum6 = fetch_double2((spinor), sid + 6*Vh); \
double2 accum7 = fetch_double2((spinor), sid + 7*Vh); \
double2 accum8 = fetch_double2((spinor), sid + 8*Vh); \
double2 accum9 = fetch_double2((spinor), sid + 9*Vh); \
double2 accum10 = fetch_double2((spinor), sid + 10*Vh); \
double2 accum11 = fetch_double2((spinor), sid + 11*Vh);
#define READ_ACCUM_SINGLE(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Nh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Nh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Nh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Nh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Nh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Nh);
float4 accum0 = tex1Dfetch((spinor), sid + 0*Vh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Vh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Vh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Vh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Vh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Vh);
#define READ_ACCUM_HALF(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Nh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Nh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Nh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Nh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Nh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Nh); \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Vh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Vh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Vh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Vh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Vh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Vh); \
float C = tex1Dfetch((accumTexNorm), sid); \
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
......@@ -118,26 +118,26 @@
#define WRITE_SPINOR_DOUBLE2() \
g_out[0*Nh+sid] = make_double2(o00_re, o00_im); \
g_out[1*Nh+sid] = make_double2(o01_re, o01_im); \
g_out[2*Nh+sid] = make_double2(o02_re, o02_im); \
g_out[3*Nh+sid] = make_double2(o10_re, o10_im); \
g_out[4*Nh+sid] = make_double2(o11_re, o11_im); \
g_out[5*Nh+sid] = make_double2(o12_re, o12_im); \
g_out[6*Nh+sid] = make_double2(o20_re, o20_im); \
g_out[7*Nh+sid] = make_double2(o21_re, o21_im); \
g_out[8*Nh+sid] = make_double2(o22_re, o22_im); \
g_out[9*Nh+sid] = make_double2(o30_re, o30_im); \
g_out[10*Nh+sid] = make_double2(o31_re, o31_im); \
g_out[11*Nh+sid] = make_double2(o32_re, o32_im);
g_out[0*Vh+sid] = make_double2(o00_re, o00_im); \
g_out[1*Vh+sid] = make_double2(o01_re, o01_im); \
g_out[2*Vh+sid] = make_double2(o02_re, o02_im); \
g_out[3*Vh+sid] = make_double2(o10_re, o10_im); \
g_out[4*Vh+sid] = make_double2(o11_re, o11_im); \
g_out[5*Vh+sid] = make_double2(o12_re, o12_im); \
g_out[6*Vh+sid] = make_double2(o20_re, o20_im); \
g_out[7*Vh+sid] = make_double2(o21_re, o21_im); \
g_out[8*Vh+sid] = make_double2(o22_re, o22_im); \
g_out[9*Vh+sid] = make_double2(o30_re, o30_im); \
g_out[10*Vh+sid] = make_double2(o31_re, o31_im); \
g_out[11*Vh+sid] = make_double2(o32_re, o32_im);
#define WRITE_SPINOR_FLOAT4() \
g_out[0*Nh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*Nh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
g_out[2*Nh+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
g_out[3*Nh+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
g_out[4*Nh+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
g_out[5*Nh+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
g_out[0*Vh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*Vh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
g_out[2*Vh+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
g_out[3*Vh+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
g_out[4*Vh+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
g_out[5*Vh+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
#define WRITE_SPINOR_SHORT4() \
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
......@@ -171,12 +171,12 @@
o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
g_out[sid+0*Nh] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
g_out[sid+1*Nh] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
g_out[sid+2*Nh] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
g_out[sid+3*Nh] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
g_out[sid+4*Nh] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
g_out[sid+5*Nh] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
g_out[sid+0*Vh] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
g_out[sid+1*Vh] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
g_out[sid+2*Vh] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
g_out[sid+3*Vh] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
g_out[sid+4*Vh] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
g_out[sid+5*Vh] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
/*
#define WRITE_SPINOR_FLOAT1_SMEM() \
......@@ -186,7 +186,7 @@
int f = SHARED_FLOATS_PER_THREAD; \
__syncthreads(); \
for (int i = 0; i < 6; i++) for (int c = 0; c < 4; c++) \
((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
((float*)g_out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
// the alternative to writing float4's directly: almost as fast, a lot more confusing
#define WRITE_SPINOR_FLOAT1_STAGGERED() \
......@@ -196,7 +196,7 @@
int f = SHARED_FLOATS_PER_THREAD; \
__syncthreads(); \
for (int i = 0; i < 4; i++) for (int c = 0; c < 4; c++) \
((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; \
((float*)g_out)[i*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4]; \
__syncthreads(); \
s[0] = o22_re; \
s[1] = o22_im; \
......@@ -208,6 +208,6 @@
s[7] = o32_im; \
__syncthreads(); \
for (int i = 0; i < 2; i++) for (int c = 0; c < 4; c++) \
((float*)g_out)[(i+4)*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
((float*)g_out)[(i+4)*(Vh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
*/
......@@ -7,6 +7,8 @@
#include <gauge_quda.h>
#include <spinor_quda.h>
#include <dslash_reference.h>
#define FULL_WILSON 1
QudaGaugeParam param;
......@@ -34,14 +36,6 @@ aligned_malloc(size_t n, void **m0)
return (void *)m;
}
void printSpinorHalfField(void *spinor, Precision precision) {
printSpinorElement(spinor, 0, precision);
printf("...\n");
printSpinorElement(spinor, Nh-1, precision);
printf("\n");
}
void init() {
Precision single = QUDA_SINGLE_PRECISION;
......@@ -51,10 +45,13 @@ void init() {
param.reconstruct = QUDA_RECONSTRUCT_12;
param.cuda_prec_sloppy = param.cuda_prec;
param.reconstruct_sloppy = param.reconstruct;
param.X = L1;
param.Y = L2;
param.Z = L3;
param.T = L4;
param.X[0] = 4;
param.X[1] = 4;
param.X[2] = 4;
param.X[3] = 4;
setDims(param.X);
param.anisotropy = 2.3;
param.t_boundary = QUDA_ANTI_PERIODIC_T;
param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -62,17 +59,19 @@ void init() {
// construct input fields
for (int dir = 0; dir < 4; dir++) {
qdpGauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
qdpGauge[dir] = (float*)malloc(V*gaugeSiteSize*sizeof(float));
}
cpsGauge = (float*)malloc(4*N*gaugeSiteSize*sizeof(float));
cpsGauge = (float*)malloc(4*V*gaugeSiteSize*sizeof(float));
spinor = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinor = (float*)malloc(V*spinorSiteSize*sizeof(float));
int dev = 0;
cudaSetDevice(dev);
cudaFullSpinor = allocateSpinorField(N, single);
cudaParitySpinor = allocateParitySpinor(Nh, single);
param.X[0] /= 2;
cudaFullSpinor = allocateSpinorField(param.X, single);
cudaParitySpinor = allocateParitySpinor(param.X, single);
param.X[0] *= 2;
}
......@@ -90,22 +89,21 @@ void packTest() {
init();
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(float) / (1 << 30);
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(float) / (1 << 30);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);
printf("Sending fields to GPU...\n"); fflush(stdout);
stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec);
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec, param.X);
double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime);
stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec);
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec, param.X);
double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......
......@@ -3,14 +3,14 @@
#include <cuda_runtime.h>
#define L1 24 // "x" dimension
#define L2 24 // "y" dimension
#define L3 24 // "z" dimension
#define L4 32 // "time" dimension
#define L1h (L1/2) // half of the full "x" dimension, useful for even/odd lattice indexing
//#define L1 4 // "x" dimension
//#define L2 4 // "y" dimension
//#define L3 4 // "z" dimension
//#define L4 4 // "time" dimension
//#define L1h (L1/2) // half of the full "x" dimension, useful for even/odd lattice indexing
#define N (L1*L2*L3*L4) // total number of lattice points
#define Nh (N/2) // total number of even/odd lattice points
//#define N (L1*L2*L3*L4) // total number of lattice points
//#define Nh (N/2) // total number of even/odd lattice points
#define MAX_SHORT 32767
......@@ -35,16 +35,30 @@ extern "C" {
#endif
typedef void *ParityGauge;
typedef void *ParityClover;
typedef struct {
size_t packedGaugeBytes;
size_t bytes;
Precision precision;
int length; // total length
int volume; // geometric volume (single parity)
int X[4]; // the geometric lengths (single parity)
int Nc; // number of colors
ReconstructType reconstruct;
ParityGauge odd;
ParityGauge even;
} FullGauge;
typedef struct {
size_t bytes;
Precision precision;
int length;
int volume;
int Nc;
int Ns;
void *clover; // pointer to clover matrix
void *cloverInverse; // pointer to inverse of clover matrix
} ParityClover;
typedef struct {
Precision precision;
ParityClover odd;
......@@ -52,8 +66,13 @@ extern "C" {
} FullClover;
typedef struct {
size_t bytes;
Precision precision;
int length; // geometric length of spinor
int length; // total length
int volume; // geometric volume (single parity)
int X[4]; // the geometric lengths (single parity)
int Nc; // length of color dimension
int Ns; // length of spin dimension
void *spinor; // either (double2*), (float4 *) or (short4 *), depending on precision
float *spinorNorm; // used only when precision is QUDA_HALF_PRECISION
} ParitySpinor;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment