
Commit b7252b18 authored by mikeaclark

Clean up of half precision and some interfaces

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@399 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 85b72c53
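For orientation, the half-precision storage convention this commit consolidates: each lattice site's 24 real spinor components (4 spins x 3 colors x 2 reals) are quantized to shorts against the site's maximum magnitude, which is kept in a separate float norm array. A minimal host-side sketch of that convention (encodeSite is an illustrative helper, not library code; it assumes MAX_SHORT = 32767.0f, matching its use in the kernels below):

#include <math.h>

// Quantize one site: 24 floats -> 24 shorts plus one float norm.
void encodeSite(const float in[24], short out[24], float *norm) {
  float c = 0.0f;
  for (int k = 0; k < 24; k++) c = fmaxf(c, fabsf(in[k])); // site max magnitude
  *norm = c;
  float scale = 32767.0f / c;
  for (int k = 0; k < 24; k++) out[k] = (short)(in[k] * scale);
}

The kernels in the diff below are the device-side equivalents of this scheme, with the reduction over the 24 magnitudes unrolled into registers.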
@@ -22,6 +22,24 @@
#define QudaSumFloat3 float3
#endif
// Double precision input spinor field
texture<int4, 1> spinorTexDouble;
// Single precision input spinor field
texture<float4, 1, cudaReadModeElementType> spinorTexSingle;
// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> spinorTexHalf;
texture<float, 1, cudaReadModeElementType> spinorTexNorm;
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#endif
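This guarded helper exists because sm_13 devices expose no double-precision texture type: the double field is bound to an int4 texture and each double is reassembled from its two 32-bit words by __hiloint2double. Schematically (the actual bind site is in copyQuda below):

// cudaBindTexture(0, spinorTexDouble, src.spinor, src.length*sizeof(double));
// double2 v = fetch_double2(spinorTexDouble, i);  // hi/lo words -> double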
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
@@ -40,12 +58,7 @@ inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
}
// For kernels with precision conversion built in
inline void checkSpinorLength(ParitySpinor &a, ParitySpinor &b) {
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
@@ -98,22 +111,217 @@ __global__ void convertSDKernel(float4 *dst, double2 *src, int length) {
}
}
__global__ void convertHSKernel(short4 *h, float *norm, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
float4 F0 = tex1Dfetch(spinorTexSingle, i + 0*length);
float4 F1 = tex1Dfetch(spinorTexSingle, i + 1*length);
float4 F2 = tex1Dfetch(spinorTexSingle, i + 2*length);
float4 F3 = tex1Dfetch(spinorTexSingle, i + 3*length);
float4 F4 = tex1Dfetch(spinorTexSingle, i + 4*length);
float4 F5 = tex1Dfetch(spinorTexSingle, i + 5*length);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F0.z), fabsf(F0.w));
float c2 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c3 = fmaxf(fabsf(F1.z), fabsf(F1.w));
float c4 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c5 = fmaxf(fabsf(F2.z), fabsf(F2.w));
float c6 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c7 = fmaxf(fabsf(F3.z), fabsf(F3.w));
float c8 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c9 = fmaxf(fabsf(F4.z), fabsf(F4.w));
float c10 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c11 = fmaxf(fabsf(F5.z), fabsf(F5.w));
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9);
c5 = fmaxf(c10, c11);
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c0 = fmaxf(c0, c1);
c0 = fmaxf(c0, c2); // c0 is now the maximum element
norm[i] = c0;
float C = __fdividef(MAX_SHORT, c0);
F0.x *= C; F0.y *= C; F0.z *= C; F0.w *= C; F1.x *= C; F1.y *= C; F1.z *= C; F1.w *= C;
F2.x *= C; F2.y *= C; F2.z *= C; F2.w *= C; F3.x *= C; F3.y *= C; F3.z *= C; F3.w *= C;
F4.x *= C; F4.y *= C; F4.z *= C; F4.w *= C; F5.x *= C; F5.y *= C; F5.z *= C; F5.w *= C;
h[i+0*length] = make_short4((short)F0.x, (short)F0.y, (short)F0.z, (short)F0.w);
h[i+1*length] = make_short4((short)F1.x, (short)F1.y, (short)F1.z, (short)F1.w);
h[i+2*length] = make_short4((short)F2.x, (short)F2.y, (short)F2.z, (short)F2.w);
h[i+3*length] = make_short4((short)F3.x, (short)F3.y, (short)F3.z, (short)F3.w);
h[i+4*length] = make_short4((short)F4.x, (short)F4.y, (short)F4.z, (short)F4.w);
h[i+5*length] = make_short4((short)F5.x, (short)F5.y, (short)F5.z, (short)F5.w);
i += gridSize;
}
}
__global__ void convertSHKernel(float4 *res, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i<length) {
float4 I0 = tex1Dfetch(spinorTexHalf, i + 0*length);
float4 I1 = tex1Dfetch(spinorTexHalf, i + 1*length);
float4 I2 = tex1Dfetch(spinorTexHalf, i + 2*length);
float4 I3 = tex1Dfetch(spinorTexHalf, i + 3*length);
float4 I4 = tex1Dfetch(spinorTexHalf, i + 4*length);
float4 I5 = tex1Dfetch(spinorTexHalf, i + 5*length);
float C = tex1Dfetch(spinorTexNorm, i);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
res[0*length+i] = I0;
res[1*length+i] = I1;
res[2*length+i] = I2;
res[3*length+i] = I3;
res[4*length+i] = I4;
res[5*length+i] = I5;
i += gridSize;
}
}
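Because spinorTexHalf is declared with cudaReadModeNormalizedFloat, tex1Dfetch returns each short divided by 32767, so a single multiply by the stored norm undoes the quantization. A worked round trip with concrete numbers (assuming MAX_SHORT = 32767.0f):

// value v = 0.5 at a site whose maximum magnitude is c = 2.0:
//   encode: h = (short)(0.5f * 32767.0f / 2.0f) = 8191
//   fetch:  f = 8191 / 32767.0f = 0.249977...
//   decode: f * c = 0.499954...   (absolute error bounded by c / 32767)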
__global__ void convertHDKernel(short4 *h, float *norm, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
double2 F0 = fetch_double2(spinorTexDouble, i + 0*length);
double2 F1 = fetch_double2(spinorTexDouble, i + 1*length);
double2 F2 = fetch_double2(spinorTexDouble, i + 2*length);
double2 F3 = fetch_double2(spinorTexDouble, i + 3*length);
double2 F4 = fetch_double2(spinorTexDouble, i + 4*length);
double2 F5 = fetch_double2(spinorTexDouble, i + 5*length);
double2 F6 = fetch_double2(spinorTexDouble, i + 6*length);
double2 F7 = fetch_double2(spinorTexDouble, i + 7*length);
double2 F8 = fetch_double2(spinorTexDouble, i + 8*length);
double2 F9 = fetch_double2(spinorTexDouble, i + 9*length);
double2 F10 = fetch_double2(spinorTexDouble, i + 10*length);
double2 F11 = fetch_double2(spinorTexDouble, i + 11*length);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c2 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c3 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c4 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c5 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c6 = fmaxf(fabsf(F6.x), fabsf(F6.y));
float c7 = fmaxf(fabsf(F7.x), fabsf(F7.y));
float c8 = fmaxf(fabsf(F8.x), fabsf(F8.y));
float c9 = fmaxf(fabsf(F9.x), fabsf(F9.y));
float c10 = fmaxf(fabsf(F10.x), fabsf(F10.y));
float c11 = fmaxf(fabsf(F11.x), fabsf(F11.y));
c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5); c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2); // c0 is now the maximum element
norm[i] = c0;
float C = __fdividef(MAX_SHORT, c0);
h[i+0*length] = make_short4((short)(C*(float)F0.x), (short)(C*(float)F0.y),
(short)(C*(float)F1.x), (short)(C*(float)F1.y));
h[i+1*length] = make_short4((short)(C*(float)F2.x), (short)(C*(float)F2.y),
(short)(C*(float)F3.x), (short)(C*(float)F3.y));
h[i+2*length] = make_short4((short)(C*(float)F4.x), (short)(C*(float)F4.y),
(short)(C*(float)F5.x), (short)(C*(float)F5.y));
h[i+3*length] = make_short4((short)(C*(float)F6.x), (short)(C*(float)F6.y),
(short)(C*(float)F7.x), (short)(C*(float)F7.y));
h[i+4*length] = make_short4((short)(C*(float)F8.x), (short)(C*(float)F8.y),
(short)(C*(float)F9.x), (short)(C*(float)F9.y));
h[i+5*length] = make_short4((short)(C*(float)F10.x), (short)(C*(float)F10.y),
(short)(C*(float)F11.x), (short)(C*(float)F11.y));
i += gridSize;
}
}
__global__ void convertDHKernel(double2 *res, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
float4 I0 = tex1Dfetch(spinorTexHalf, i + 0*length);
float4 I1 = tex1Dfetch(spinorTexHalf, i + 1*length);
float4 I2 = tex1Dfetch(spinorTexHalf, i + 2*length);
float4 I3 = tex1Dfetch(spinorTexHalf, i + 3*length);
float4 I4 = tex1Dfetch(spinorTexHalf, i + 4*length);
float4 I5 = tex1Dfetch(spinorTexHalf, i + 5*length);
float C = tex1Dfetch(spinorTexNorm, i);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
res[0*length+i] = make_double2(I0.x, I0.y);
res[1*length+i] = make_double2(I0.z, I0.w);
res[2*length+i] = make_double2(I1.x, I1.y);
res[3*length+i] = make_double2(I1.z, I1.w);
res[4*length+i] = make_double2(I2.x, I2.y);
res[5*length+i] = make_double2(I2.z, I2.w);
res[6*length+i] = make_double2(I3.x, I3.y);
res[7*length+i] = make_double2(I3.z, I3.w);
res[8*length+i] = make_double2(I4.x, I4.y);
res[9*length+i] = make_double2(I4.z, I4.w);
res[10*length+i] = make_double2(I5.x, I5.y);
res[11*length+i] = make_double2(I5.z, I5.w);
i += gridSize;
}
}
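All four converters share the same grid-stride loop, so a fixed launch geometry covers any field length. The pattern in isolation (a generic sketch, not part of this file):

__global__ void gridStrideCopy(float *out, const float *in, int length) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;
  while (i < length) { // each thread advances by the total thread count
    out[i] = in[i];
    i += gridSize;
  }
}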
void copyQuda(ParitySpinor dst, ParitySpinor src) {
checkSpinorLength(dst, src);
int convertLength = dst.length / spinorSiteSize;
int blocks = min(REDUCE_MAX_BLOCKS, max(convertLength/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
convertDSKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, (float4*)src.spinor, convertLength);
} else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
convertSDKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, (double2*)src.spinor, convertLength);
} else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
int spinor_bytes = dst.length*sizeof(short);
cudaBindTexture(0, spinorTexHalf, src.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, src.spinorNorm, spinor_bytes/12);
convertSHKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, convertLength);
} else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
int spinor_bytes = dst.length*sizeof(float);
cudaBindTexture(0, spinorTexSingle, src.spinor, spinor_bytes);
convertHSKernel<<<dimGrid, dimBlock>>>((short4*)dst.spinor, (float*)dst.spinorNorm, convertLength);
} else if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
int spinor_bytes = dst.length*sizeof(short);
cudaBindTexture(0, spinorTexHalf, src.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, src.spinorNorm, spinor_bytes/12);
convertDHKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, convertLength);
} else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
int spinor_bytes = dst.length*sizeof(double);
cudaBindTexture(0, spinorTexDouble, src.spinor, spinor_bytes);
convertHDKernel<<<dimGrid, dimBlock>>>((short4*)dst.spinor, (float*)dst.spinorNorm, convertLength);
} else if (dst.precision == QUDA_DOUBLE_PRECISION) {
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(double), cudaMemcpyDeviceToDevice);
} else {
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(float), cudaMemcpyDeviceToDevice);
}
}
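With the half paths in place, a precision change is a single call in either direction. A hypothetical usage (a and b are ParitySpinor fields allocated elsewhere at double and half precision; allocation is outside this diff):

// copyQuda(b, a);  // double -> half: quantizes and fills b.spinorNorm
// copyQuda(a, b);  // half -> double: rescales by b.spinorNorm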
@@ -130,12 +130,9 @@
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX spinorTexHalf
#define DD_PARAM1 short4* g_out, float *c
#define WRITE_SPINOR WRITE_SPINOR_SHORT4
#if (DD_XPAY==1)
#define ACCUMTEX accumTexHalf
#define READ_ACCUM READ_ACCUM_HALF
#endif
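With DD_PARAM1 and WRITE_SPINOR set this way, the generated half-precision kernels write short4 data plus a float norm array, which is why the call sites below now pass (short4*)res.spinor and (float*)res.spinorNorm. A sketch of the generated signature (illustrative; the body is stamped out by dslash_def.h from the shared core):

// __global__ void dslashSH12XpayKernel(short4* g_out, float *c, int oddBit, float a)
// { ... reads via SPINORTEX/READ_SPINOR, accumulates via ACCUMTEX/READ_ACCUM,
//   writes via WRITE_SPINOR_SHORT4 ... }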
@@ -71,90 +71,6 @@ __constant__ double t_boundary;
#include <dslash_def.h>
__global__ void spinorHalfPack(float *c, void *half) {
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
short4 *h = (short4 *)half;
float4 F0 = tex1Dfetch(spinorTexSingle, sid + 0*Nh);
float4 F1 = tex1Dfetch(spinorTexSingle, sid + 1*Nh);
float4 F2 = tex1Dfetch(spinorTexSingle, sid + 2*Nh);
float4 F3 = tex1Dfetch(spinorTexSingle, sid + 3*Nh);
float4 F4 = tex1Dfetch(spinorTexSingle, sid + 4*Nh);
float4 F5 = tex1Dfetch(spinorTexSingle, sid + 5*Nh);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F0.z), fabsf(F0.w));
float c2 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c3 = fmaxf(fabsf(F1.z), fabsf(F1.w));
float c4 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c5 = fmaxf(fabsf(F2.z), fabsf(F2.w));
float c6 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c7 = fmaxf(fabsf(F3.z), fabsf(F3.w));
float c8 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c9 = fmaxf(fabsf(F4.z), fabsf(F4.w));
float c10 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c11 = fmaxf(fabsf(F5.z), fabsf(F5.w));
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9);
c5 = fmaxf(c10, c11);
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c0 = fmaxf(c0, c1);
c0 = fmaxf(c0, c2); // c0 is now the maximum element
c[sid] = c0;
float scale = __fdividef(MAX_SHORT, c0);
F0.x *= scale; F0.y *= scale; F0.z *= scale; F0.w *= scale;
F1.x *= scale; F1.y *= scale; F1.z *= scale; F1.w *= scale;
F2.x *= scale; F2.y *= scale; F2.z *= scale; F2.w *= scale;
F3.x *= scale; F3.y *= scale; F3.z *= scale; F3.w *= scale;
F4.x *= scale; F4.y *= scale; F4.z *= scale; F4.w *= scale;
F5.x *= scale; F5.y *= scale; F5.z *= scale; F5.w *= scale;
h[sid+0*Nh] = make_short4((short)F0.x, (short)F0.y, (short)F0.z, (short)F0.w);
h[sid+1*Nh] = make_short4((short)F1.x, (short)F1.y, (short)F1.z, (short)F1.w);
h[sid+2*Nh] = make_short4((short)F2.x, (short)F2.y, (short)F2.z, (short)F2.w);
h[sid+3*Nh] = make_short4((short)F3.x, (short)F3.y, (short)F3.z, (short)F3.w);
h[sid+4*Nh] = make_short4((short)F4.x, (short)F4.y, (short)F4.z, (short)F4.w);
h[sid+5*Nh] = make_short4((short)F5.x, (short)F5.y, (short)F5.z, (short)F5.w);
}
__global__ void spinorHalfUnpack(ParitySpinor out) {
float4* out4 = (float4*)out.spinor;
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
float4 I0 = tex1Dfetch(spinorTexHalf, sid + 0*Nh);
float4 I1 = tex1Dfetch(spinorTexHalf, sid + 1*Nh);
float4 I2 = tex1Dfetch(spinorTexHalf, sid + 2*Nh);
float4 I3 = tex1Dfetch(spinorTexHalf, sid + 3*Nh);
float4 I4 = tex1Dfetch(spinorTexHalf, sid + 4*Nh);
float4 I5 = tex1Dfetch(spinorTexHalf, sid + 5*Nh);
float C = tex1Dfetch(spinorTexNorm, sid);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
out4[0*Nh+sid] = I0;
out4[1*Nh+sid] = I1;
out4[2*Nh+sid] = I2;
out4[3*Nh+sid] = I3;
out4[4*Nh+sid] = I4;
out4[5*Nh+sid] = I5;
}
void setCudaGaugeParam() {
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
@@ -228,19 +144,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashSCuda(out, gauge, in, parity, dagger);
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger);
}
}
@@ -428,22 +332,7 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out, gauge, in, parity, dagger, x, a);
} else if (in.precision == QUDA_HALF_PRECISION) {
printf("Not yet implemented\n");
exit(-1);
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
int spinor_float_bytes = Nh*spinorSiteSize*sizeof(float);
cudaBindTexture(0, spinorTexSingle, in.spinor, spinor_float_bytes);
spinorHalfPack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(hSpinor1.spinorNorm, hSpinor1.spinor);
dslashXpayHCuda(hSpinor2, gauge, hSpinor1, parity, dagger, x, a);
int spinor_half_bytes = Nh*spinorSiteSize*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, hSpinor2.spinor, spinor_half_bytes);
cudaBindTexture(0, spinorTexNorm, hSpinor2.spinorNorm, spinor_half_bytes/12);
spinorHalfUnpack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(out);
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
}
}
@@ -587,43 +476,55 @@ void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashDH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashDH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashDH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashSH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashSH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashSH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
} else {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashHH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashHH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashHH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
}
@@ -660,18 +561,13 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
} else if (in.precision == QUDA_HALF_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 0);
dslashXpayHCuda(out, gauge, tmp, 0, 0, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 0);
dslashXpayHCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
}
}
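In symbols, the half path now matches the other precisions. Assuming dslashXpayHCuda(out, gauge, in, parity, dagger, x, a) computes out = a*D(in) + x (consistent with its use in MatCuda below) and kappa2 = -kappa^2, the even-even branch is:

// tmp = D_{oe} in
// out = kappa2 * D_{eo} tmp + in = (1 - kappa^2 D_{eo} D_{oe}) in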
@@ -702,18 +598,13 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap
dslashXpaySCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
} else {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 1);
dslashXpayHCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 1);
dslashXpayHCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
}
}
@@ -735,8 +626,8 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatCuda\n");
exit(-1);
dslashXpayHCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
}
}
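The same xpay identity gives the full (unpreconditioned) matrix on checkerboarded fields without ever forming it explicitly:

// out_odd  = in_odd  - kappa * D_{oe} in_even
// out_even = in_even - kappa * D_{eo} in_odd
// i.e. out = (1 - kappa D) in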
@@ -752,8 +643,8 @@ void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa)
dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatDaggerCuda\n");
exit(-1);
dslashXpayHCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
}
}
@@ -29,9 +29,6 @@ extern "C" {
extern FullClover cudaClover;