Advanced Computing Platform for Theoretical Physics

Commit b7252b18 authored by mikeaclark

Clean up of half precision and some interfaces

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@399 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 85b72c53
...@@ -22,6 +22,24 @@
#define QudaSumFloat3 float3
#endif
// Double precision input spinor field
texture<int4, 1> spinorTexDouble;
// Single precision input spinor field
texture<float4, 1, cudaReadModeElementType> spinorTexSingle;
// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> spinorTexHalf;
texture<float, 1, cudaReadModeElementType> spinorTexNorm;
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#endif
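The double-precision field is bound to an int4 texture because these devices have no double2 texture fetch; fetch_double2 reassembles each double from its high and low 32-bit words. As a hedged illustration of how the word layout lines up (pack_double2 is a hypothetical helper, not part of this commit; it only uses the standard __double2loint/__double2hiint intrinsics):
static __inline__ __device__ int4 pack_double2(double2 d)
{
// lo/hi words of d.x go into v.x/v.y, lo/hi words of d.y into v.z/v.w,
// matching the order expected by fetch_double2 above
return make_int4(__double2loint(d.x), __double2hiint(d.x),
__double2loint(d.y), __double2hiint(d.y));
}
In practice the spinor is already stored as double2 and simply bound to spinorTexDouble with cudaBindTexture, so no explicit packing step is needed; the sketch only shows the word ordering.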
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
...@@ -40,12 +58,7 @@ inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
}
// For kernels with precision conversion built in
inline void checkSpinorLength(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
exit(-1);
}
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
...@@ -98,22 +111,217 @@ __global__ void convertSDKernel(float4 *dst, double2 *src, int length) {
}
}
__global__ void convertHSKernel(short4 *h, float *norm, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
float4 F0 = tex1Dfetch(spinorTexSingle, i + 0*length);
float4 F1 = tex1Dfetch(spinorTexSingle, i + 1*length);
float4 F2 = tex1Dfetch(spinorTexSingle, i + 2*length);
float4 F3 = tex1Dfetch(spinorTexSingle, i + 3*length);
float4 F4 = tex1Dfetch(spinorTexSingle, i + 4*length);
float4 F5 = tex1Dfetch(spinorTexSingle, i + 5*length);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F0.z), fabsf(F0.w));
float c2 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c3 = fmaxf(fabsf(F1.z), fabsf(F1.w));
float c4 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c5 = fmaxf(fabsf(F2.z), fabsf(F2.w));
float c6 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c7 = fmaxf(fabsf(F3.z), fabsf(F3.w));
float c8 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c9 = fmaxf(fabsf(F4.z), fabsf(F4.w));
float c10 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c11 = fmaxf(fabsf(F5.z), fabsf(F5.w));
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9);
c5 = fmaxf(c10, c11);
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c0 = fmaxf(c0, c1);
c0 = fmaxf(c0, c2); // c0 is now the maximum element
norm[i] = c0;
float C = __fdividef(MAX_SHORT, c0);
F0.x *= C; F0.y *= C; F0.z *= C; F0.w *= C; F1.x *= C; F1.y *= C; F1.z *= C; F1.w *= C;
F2.x *= C; F2.y *= C; F2.z *= C; F2.w *= C; F3.x *= C; F3.y *= C; F3.z *= C; F3.w *= C;
F4.x *= C; F4.y *= C; F4.z *= C; F4.w *= C; F5.x *= C; F5.y *= C; F5.z *= C; F5.w *= C;
h[i+0*length] = make_short4((short)F0.x, (short)F0.y, (short)F0.z, (short)F0.w);
h[i+1*length] = make_short4((short)F1.x, (short)F1.y, (short)F1.z, (short)F1.w);
h[i+2*length] = make_short4((short)F2.x, (short)F2.y, (short)F2.z, (short)F2.w);
h[i+3*length] = make_short4((short)F3.x, (short)F3.y, (short)F3.z, (short)F3.w);
h[i+4*length] = make_short4((short)F4.x, (short)F4.y, (short)F4.z, (short)F4.w);
h[i+5*length] = make_short4((short)F5.x, (short)F5.y, (short)F5.z, (short)F5.w);
i += gridSize;
}
}
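convertHSKernel implements the half-precision storage scheme used throughout this commit: for each site the 24 real spinor components are reduced to their maximum absolute value, that maximum is written to the float norm array, and the components are rescaled by MAX_SHORT/norm and truncated to short. A host-side sketch of the same per-site quantization, assuming MAX_SHORT is 32767.0f (illustrative only, not part of the commit):
// quantize one site of n floats into shorts plus a per-site norm (sketch)
void quantizeSite(const float *v, int n, short *out, float *site_norm)
{
float c = 0.0f;
for (int j = 0; j < n; j++) c = fmaxf(c, fabsf(v[j])); // site maximum
*site_norm = c;
float scale = 32767.0f / c; // caller must ensure c > 0
for (int j = 0; j < n; j++) out[j] = (short)(v[j] * scale);
}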
__global__ void convertSHKernel(float4 *res, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i<length) {
float4 I0 = tex1Dfetch(spinorTexHalf, i + 0*length);
float4 I1 = tex1Dfetch(spinorTexHalf, i + 1*length);
float4 I2 = tex1Dfetch(spinorTexHalf, i + 2*length);
float4 I3 = tex1Dfetch(spinorTexHalf, i + 3*length);
float4 I4 = tex1Dfetch(spinorTexHalf, i + 4*length);
float4 I5 = tex1Dfetch(spinorTexHalf, i + 5*length);
float C = tex1Dfetch(spinorTexNorm, i);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
res[0*length+i] = I0;
res[1*length+i] = I1;
res[2*length+i] = I2;
res[3*length+i] = I3;
res[4*length+i] = I4;
res[5*length+i] = I5;
i += gridSize;
}
}
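convertSHKernel relies on spinorTexHalf being declared with cudaReadModeNormalizedFloat: the texture unit returns each short already divided by 32767, i.e. mapped into [-1, 1], so multiplying by the per-site norm fetched from spinorTexNorm recovers the single-precision value. The CPU equivalent of one such fetch, assuming that 32767 normalization matches MAX_SHORT above (a sketch, not part of the commit):
// reconstruct one component from its stored short and the site norm (sketch)
inline float dequantize(short s, float site_norm)
{
return (s / 32767.0f) * site_norm;
}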
__global__ void convertHDKernel(short4 *h, float *norm, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
double2 F0 = fetch_double2(spinorTexDouble, i + 0*length);
double2 F1 = fetch_double2(spinorTexDouble, i + 1*length);
double2 F2 = fetch_double2(spinorTexDouble, i + 2*length);
double2 F3 = fetch_double2(spinorTexDouble, i + 3*length);
double2 F4 = fetch_double2(spinorTexDouble, i + 4*length);
double2 F5 = fetch_double2(spinorTexDouble, i + 5*length);
double2 F6 = fetch_double2(spinorTexDouble, i + 6*length);
double2 F7 = fetch_double2(spinorTexDouble, i + 7*length);
double2 F8 = fetch_double2(spinorTexDouble, i + 8*length);
double2 F9 = fetch_double2(spinorTexDouble, i + 9*length);
double2 F10 = fetch_double2(spinorTexDouble, i + 10*length);
double2 F11 = fetch_double2(spinorTexDouble, i + 11*length);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c2 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c3 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c4 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c5 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c6 = fmaxf(fabsf(F6.x), fabsf(F6.y));
float c7 = fmaxf(fabsf(F7.x), fabsf(F7.y));
float c8 = fmaxf(fabsf(F8.x), fabsf(F8.y));
float c9 = fmaxf(fabsf(F9.x), fabsf(F9.y));
float c10 = fmaxf(fabsf(F10.x), fabsf(F10.y));
float c11 = fmaxf(fabsf(F11.x), fabsf(F11.y));
c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5); c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2); // c0 is now the maximum element
norm[i] = c0;
float C = __fdividef(MAX_SHORT, c0);
h[i+0*length] = make_short4((short)(C*(float)F0.x), (short)(C*(float)F0.y),
(short)(C*(float)F1.x), (short)(C*(float)F1.y));
h[i+1*length] = make_short4((short)(C*(float)F2.x), (short)(C*(float)F2.y),
(short)(C*(float)F3.x), (short)(C*(float)F3.y));
h[i+2*length] = make_short4((short)(C*(float)F4.x), (short)(C*(float)F4.y),
(short)(C*(float)F5.x), (short)(C*(float)F5.y));
h[i+3*length] = make_short4((short)(C*(float)F6.x), (short)(C*(float)F6.y),
(short)(C*(float)F7.x), (short)(C*(float)F7.y));
h[i+4*length] = make_short4((short)(C*(float)F8.x), (short)(C*(float)F8.y),
(short)(C*(float)F9.x), (short)(C*(float)F9.y));
h[i+5*length] = make_short4((short)(C*(float)F10.x), (short)(C*(float)F10.y),
(short)(C*(float)F11.x), (short)(C*(float)F11.y));
i += gridSize;
}
}
__global__ void convertDHKernel(double2 *res, int length) {
int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while(i < length) {
float4 I0 = tex1Dfetch(spinorTexHalf, i + 0*length);
float4 I1 = tex1Dfetch(spinorTexHalf, i + 1*length);
float4 I2 = tex1Dfetch(spinorTexHalf, i + 2*length);
float4 I3 = tex1Dfetch(spinorTexHalf, i + 3*length);
float4 I4 = tex1Dfetch(spinorTexHalf, i + 4*length);
float4 I5 = tex1Dfetch(spinorTexHalf, i + 5*length);
float C = tex1Dfetch(spinorTexNorm, i);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
res[0*length+i] = make_double2(I0.x, I0.y);
res[1*length+i] = make_double2(I0.z, I0.w);
res[2*length+i] = make_double2(I1.x, I1.y);
res[3*length+i] = make_double2(I1.z, I1.w);
res[4*length+i] = make_double2(I2.x, I2.y);
res[5*length+i] = make_double2(I2.z, I2.w);
res[6*length+i] = make_double2(I3.x, I3.y);
res[7*length+i] = make_double2(I3.z, I3.w);
res[8*length+i] = make_double2(I4.x, I4.y);
res[9*length+i] = make_double2(I4.z, I4.w);
res[10*length+i] = make_double2(I5.x, I5.y);
res[11*length+i] = make_double2(I5.z, I5.w);
i += gridSize;
}
}
void copyQuda(ParitySpinor dst, ParitySpinor src) {
checkSpinorLength(dst, src);
int convertLength = dst.length / spinorSiteSize;
int blocks = min(REDUCE_MAX_BLOCKS, max(convertLength/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
convertDSKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, (float4*)src.spinor, convertLength);
} else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
convertSDKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, (double2*)src.spinor, convertLength);
} else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
int spinor_bytes = dst.length*sizeof(short);
cudaBindTexture(0, spinorTexHalf, src.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, src.spinorNorm, spinor_bytes/12);
convertSHKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, convertLength);
} else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_SINGLE_PRECISION) {
int spinor_bytes = dst.length*sizeof(float);
cudaBindTexture(0, spinorTexSingle, src.spinor, spinor_bytes);
convertHSKernel<<<dimGrid, dimBlock>>>((short4*)dst.spinor, (float*)dst.spinorNorm, convertLength);
} else if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_HALF_PRECISION) {
int spinor_bytes = dst.length*sizeof(short);
cudaBindTexture(0, spinorTexHalf, src.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexNorm, src.spinorNorm, spinor_bytes/12);
convertDHKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, convertLength);
} else if (dst.precision == QUDA_HALF_PRECISION && src.precision == QUDA_DOUBLE_PRECISION) {
int spinor_bytes = dst.length*sizeof(double);
cudaBindTexture(0, spinorTexDouble, src.spinor, spinor_bytes);
convertHDKernel<<<dimGrid, dimBlock>>>((short4*)dst.spinor, (float*)dst.spinorNorm, convertLength);
} else if (dst.precision == QUDA_DOUBLE_PRECISION) {
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(double), cudaMemcpyDeviceToDevice);
} else {
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(float), cudaMemcpyDeviceToDevice);
}
}
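With the four new branches, copyQuda doubles as the general precision-conversion routine: mixed-precision copies dispatch to the convert kernels (binding the relevant input textures first), while copies between fields of the same double or single precision fall back to a plain device-to-device cudaMemcpy. A hedged usage sketch, reusing allocateParitySpinor and Nh as they appear elsewhere in this commit (error checking omitted):
ParitySpinor x_single = allocateParitySpinor(Nh, QUDA_SINGLE_PRECISION);
ParitySpinor x_half = allocateParitySpinor(Nh, QUDA_HALF_PRECISION);
// ... fill x_single on the device ...
copyQuda(x_half, x_single); // quantize to short4 components plus per-site norms
copyQuda(x_single, x_half); // convert back, exact up to the 16-bit quantization error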
...
...@@ -130,12 +130,9 @@
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX spinorTexHalf
#if (DD_XPAY==0)
#define DD_PARAM1 short4* g_out, float *c
#define WRITE_SPINOR WRITE_SPINOR_SHORT4
#if (DD_XPAY==1)
#define DD_PARAM1 float4* g_out
#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
#define ACCUMTEX accumTexHalf
#define READ_ACCUM READ_ACCUM_HALF
#endif
...
...@@ -71,90 +71,6 @@ __constant__ double t_boundary;
#include <dslash_def.h>
__global__ void spinorHalfPack(float *c, void *half) {
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
short4 *h = (short4 *)half;
float4 F0 = tex1Dfetch(spinorTexSingle, sid + 0*Nh);
float4 F1 = tex1Dfetch(spinorTexSingle, sid + 1*Nh);
float4 F2 = tex1Dfetch(spinorTexSingle, sid + 2*Nh);
float4 F3 = tex1Dfetch(spinorTexSingle, sid + 3*Nh);
float4 F4 = tex1Dfetch(spinorTexSingle, sid + 4*Nh);
float4 F5 = tex1Dfetch(spinorTexSingle, sid + 5*Nh);
float c0 = fmaxf(fabsf(F0.x), fabsf(F0.y));
float c1 = fmaxf(fabsf(F0.z), fabsf(F0.w));
float c2 = fmaxf(fabsf(F1.x), fabsf(F1.y));
float c3 = fmaxf(fabsf(F1.z), fabsf(F1.w));
float c4 = fmaxf(fabsf(F2.x), fabsf(F2.y));
float c5 = fmaxf(fabsf(F2.z), fabsf(F2.w));
float c6 = fmaxf(fabsf(F3.x), fabsf(F3.y));
float c7 = fmaxf(fabsf(F3.z), fabsf(F3.w));
float c8 = fmaxf(fabsf(F4.x), fabsf(F4.y));
float c9 = fmaxf(fabsf(F4.z), fabsf(F4.w));
float c10 = fmaxf(fabsf(F5.x), fabsf(F5.y));
float c11 = fmaxf(fabsf(F5.z), fabsf(F5.w));
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c3 = fmaxf(c6, c7);
c4 = fmaxf(c8, c9);
c5 = fmaxf(c10, c11);
c0 = fmaxf(c0, c1);
c1 = fmaxf(c2, c3);
c2 = fmaxf(c4, c5);
c0 = fmaxf(c0, c1);
c0 = fmaxf(c0, c2); // c0 is now the maximum element
c[sid] = c0;
float scale = __fdividef(MAX_SHORT, c0);
F0.x *= scale; F0.y *= scale; F0.z *= scale; F0.w *= scale;
F1.x *= scale; F1.y *= scale; F1.z *= scale; F1.w *= scale;
F2.x *= scale; F2.y *= scale; F2.z *= scale; F2.w *= scale;
F3.x *= scale; F3.y *= scale; F3.z *= scale; F3.w *= scale;
F4.x *= scale; F4.y *= scale; F4.z *= scale; F4.w *= scale;
F5.x *= scale; F5.y *= scale; F5.z *= scale; F5.w *= scale;
h[sid+0*Nh] = make_short4((short)F0.x, (short)F0.y, (short)F0.z, (short)F0.w);
h[sid+1*Nh] = make_short4((short)F1.x, (short)F1.y, (short)F1.z, (short)F1.w);
h[sid+2*Nh] = make_short4((short)F2.x, (short)F2.y, (short)F2.z, (short)F2.w);
h[sid+3*Nh] = make_short4((short)F3.x, (short)F3.y, (short)F3.z, (short)F3.w);
h[sid+4*Nh] = make_short4((short)F4.x, (short)F4.y, (short)F4.z, (short)F4.w);
h[sid+5*Nh] = make_short4((short)F5.x, (short)F5.y, (short)F5.z, (short)F5.w);
}
__global__ void spinorHalfUnpack(ParitySpinor out) {
float4* out4 = (float4*)out.spinor;
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
float4 I0 = tex1Dfetch(spinorTexHalf, sid + 0*Nh);
float4 I1 = tex1Dfetch(spinorTexHalf, sid + 1*Nh);
float4 I2 = tex1Dfetch(spinorTexHalf, sid + 2*Nh);
float4 I3 = tex1Dfetch(spinorTexHalf, sid + 3*Nh);
float4 I4 = tex1Dfetch(spinorTexHalf, sid + 4*Nh);
float4 I5 = tex1Dfetch(spinorTexHalf, sid + 5*Nh);
float C = tex1Dfetch(spinorTexNorm, sid);
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C;
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C;
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C;
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C;
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
out4[0*Nh+sid] = I0;
out4[1*Nh+sid] = I1;
out4[2*Nh+sid] = I2;
out4[3*Nh+sid] = I3;
out4[4*Nh+sid] = I4;
out4[5*Nh+sid] = I5;
}
void setCudaGaugeParam() {
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
...@@ -228,19 +144,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashSCuda(out, gauge, in, parity, dagger);
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger);
dim3 blockDim(BLOCK_DIM, 1, 1);
int spinor_float_bytes = Nh*spinorSiteSize*sizeof(float);
cudaBindTexture(0, spinorTexSingle, in.spinor, spinor_float_bytes);
spinorHalfPack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(hSpinor1.spinorNorm, hSpinor1.spinor);
dslashHCuda(hSpinor2, gauge, hSpinor1, parity, dagger);
int spinor_half_bytes = Nh*spinorSiteSize*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, hSpinor2.spinor, spinor_half_bytes);
cudaBindTexture(0, spinorTexNorm, hSpinor2.spinorNorm, spinor_half_bytes/12);
spinorHalfUnpack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(out);
}
}
...@@ -428,22 +332,7 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out, gauge, in, parity, dagger, x, a);
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
exit(-1);
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
int spinor_float_bytes = Nh*spinorSiteSize*sizeof(float);
cudaBindTexture(0, spinorTexSingle, in.spinor, spinor_float_bytes);
spinorHalfPack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(hSpinor1.spinorNorm, hSpinor1.spinor);
dslashXpayHCuda(hSpinor2, gauge, hSpinor1, parity, dagger, x, a);
int spinor_half_bytes = Nh*spinorSiteSize*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, hSpinor2.spinor, spinor_half_bytes);
cudaBindTexture(0, spinorTexNorm, hSpinor2.spinorNorm, spinor_half_bytes/12);
spinorHalfUnpack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(out);
}
}
...@@ -587,43 +476,55 @@ void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashDH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashDH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashDH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashSH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashSH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashSH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
} else {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHH12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashHH12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashHH8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
} else {
dslashHH8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>
((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a);
}
}
}
...@@ -660,18 +561,13 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
} else if (in.precision == QUDA_HALF_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 0);
dslashXpayHCuda(out, gauge, tmp, 0, 0, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 0);
dslashXpayHCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
if (matpc_type == QUDA_MATPC_EVEN_EVEN) dslashHCuda(hSpinor2, gauge, hSpinor1, 1, 0);
else dslashHCuda(hSpinor2, gauge, hSpinor1, 0, 0);
if (matpc_type == QUDA_MATPC_EVEN_EVEN) dslashXpayHCuda(out, gauge, hSpinor2, 0, 0, hSpinor1, kappa2);
else dslashXpayHCuda(out, gauge, hSpinor2, 1, 0, hSpinor1, kappa2);
}
}
...@@ -702,18 +598,13 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap
dslashXpaySCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
} else {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 1);
dslashXpayHCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 1);
dslashXpayHCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
if (matpc_type == QUDA_MATPC_EVEN_EVEN) dslashHCuda(hSpinor2, gauge, hSpinor1, 1, 1);
else dslashHCuda(hSpinor2, gauge, hSpinor1, 0, 1);
if (matpc_type == QUDA_MATPC_EVEN_EVEN) dslashXpayHCuda(out, gauge, hSpinor2, 0, 1, hSpinor1, kappa2);
else dslashXpayHCuda(out, gauge, hSpinor2, 1, 1, hSpinor1, kappa2);
}
}
...@@ -735,8 +626,8 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
}
}
...@@ -752,8 +643,8 @@ void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa)
dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
}
}
...
...@@ -29,9 +29,6 @@ extern "C" {
extern FullClover cudaClover;
extern ParitySpinor hSpinor1;
extern ParitySpinor hSpinor2;
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes();
...
...@@ -30,8 +30,8 @@ int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.X = L1;
...@@ -45,7 +45,7 @@ void init() {
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
...@@ -88,12 +88,14 @@ void init() {
if (test_type < 2) {
loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec,
inv_param.dirac_order);
} else {
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.dirac_order);
}
}
}
void end() {
...@@ -189,10 +191,8 @@ void dslashTest() {
double secs = dslashCUDA();
if (!TRANSFER) {
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec, inv_param.dirac_order);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec, inv_param.dirac_order);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
...
...@@ -88,11 +88,6 @@ void initQuda(int dev)
cudaGaugeSloppy.even = NULL;
cudaGaugeSloppy.odd = NULL;
hSpinor1.spinor = NULL;
hSpinor1.spinorNorm = NULL;
hSpinor2.spinor = NULL;
hSpinor2.spinorNorm = NULL;
}
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
...@@ -131,9 +126,9 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(out);
freeParitySpinor(in);
...@@ -145,9 +140,9 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
...@@ -160,9 +155,9 @@ void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCDagCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
...@@ -175,9 +170,9 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
...@@ -188,16 +183,12 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 0);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 0);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 0, in.odd, -inv_param->kappa);
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 0, in.even, -inv_param->kappa);
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
inv_param->cuda_prec, inv_param->dirac_order);
freeSpinorField(out);
freeSpinorField(in);
...@@ -207,17 +198,12 @@ void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
inv_param->cuda_prec, inv_param->dirac_order);
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 1);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 1);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 1, in.odd, -inv_param->kappa);
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 1, in.even, -inv_param->kappa);
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
inv_param->cuda_prec, inv_param->dirac_order);
freeSpinorField(out);
freeSpinorField(in);
...@@ -261,21 +247,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) { x.odd = tmp; x.even = out; }
else { x.even = tmp; x.odd = out; }
loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order);
/*ParitySpinor t = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
ParitySpinor s = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
dslashCuda(in, cudaGaugeSloppy, b.odd, 0, 0);
copyQuda(s, b.odd);
dslashCuda(t, cudaGaugeSloppy, s, 0, 0);
copyQuda(b.even, t);
printf("%e %e %e\n", normQuda(in), normQuda(t), normQuda(b.even));
FILE *file = fopen("test.dat","w");
for (int i=0; i<in.length; i++) {
fprintf(file, "%d %e %e %e\n", i, ((double*)in.spinor)[i], ((float*)t.spinor)[i], ((double*)b.even.spinor)[i]);
}
exit(0);*/
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION) {
...@@ -291,7 +263,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION){
loadParitySpinor(in, h_b, param->cpu_prec, param->dirac_order);
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION)
...@@ -326,7 +298,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
// qdp dirac fields are even-odd ordered
b.even = in;
loadSpinorField(b, h_b, param->cpu_prec, param->dirac_order);
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
...@@ -335,12 +307,12 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
dslashXpayCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
}
retrieveSpinorField(h_x, x, param->cpu_prec, param->dirac_order);
if (param->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeSpinorField(b);
} else {
retrieveParitySpinor(h_x, out, param->cpu_prec, param->dirac_order);
}
freeParitySpinor(tmp);
...
...@@ -44,7 +44,7 @@ int main(int argc, char **argv)
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
...
...@@ -13,9 +13,6 @@ FullClover cudaClover;
void *packedSpinor1 = 0;
void *packedSpinor2 = 0;
// Half precision spinor field temporaries
ParitySpinor hSpinor1, hSpinor2;
ParitySpinor allocateParitySpinor(int geometric_length, Precision precision) {
ParitySpinor ret;
...@@ -57,12 +54,6 @@ FullSpinor allocateSpinorField(int length, Precision precision) {
return ret;
}
void allocateSpinorHalf() {
Precision precision = QUDA_HALF_PRECISION;
hSpinor1 = allocateParitySpinor(Nh, precision);
hSpinor2 = allocateParitySpinor(Nh, precision);
}
ParityClover allocateParityClover() {
ParityClover ret;
...@@ -113,11 +104,6 @@ void freeSpinorBuffer() {
packedSpinor1 = NULL;
}
void freeSpinorHalf() {
freeParitySpinor(hSpinor1);
freeParitySpinor(hSpinor2);
}
inline void packDouble2(double2* a, double *b) {
a->x = b[0]; a->y = b[1];
}
...@@ -681,179 +667,191 @@ void unpackQDPParitySpinorSS(float *res, float4 *spinorPacked) {
void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
DiracFieldOrder dirac_order) {
if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error, cannot have CUDA double precision without double CPU precision\n");
exit(-1);
}
if (ret.precision != QUDA_HALF_PRECISION) {
size_t spinor_bytes;
if (ret.precision == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
} else if (!hSpinor1.spinor || !hSpinor1.spinorNorm ||
!hSpinor2.spinor || !hSpinor2.spinorNorm) {
printf("allocateSpinorHalf error %lu %lu %lu %lu\n",
(unsigned long)hSpinor1.spinor, (unsigned long)hSpinor1.spinorNorm,
(unsigned long)hSpinor2.spinor, (unsigned long)hSpinor2.spinorNorm);
exit(-1);
}
}
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
#ifndef __DEVICE_EMULATION__
//if (!packedSpinor1) cudaHostAlloc(&packedSpinor1, spinor_bytes, cudaHostAllocDefault);
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
#else
if (!packedSpinor1) packedSpinor1 = malloc(spinor_bytes);
#endif
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (ret.precision == QUDA_DOUBLE_PRECISION) {
packParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (ret.precision == QUDA_DOUBLE_PRECISION) {
packQDPParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packQDPParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
}
}
cudaMemcpy(ret.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
} else {
ParitySpinor tmp = allocateParitySpinor(ret.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
loadParitySpinor(tmp, spinor, cpu_prec, dirac_order);
copyQuda(ret, tmp);
freeParitySpinor(tmp);
}
cudaMemcpy(ret.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
}
void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec, Precision cuda_prec) {
}
printf("Sorry, half precision isn't supported\n");
exit(-1);
}
void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (ret.even.precision != QUDA_HALF_PRECISION) {
size_t spinor_bytes;
if (ret.even.precision == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else if (ret.even.precision == QUDA_SINGLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(float);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
#ifndef __DEVICE_EMULATION__
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
if (!packedSpinor2) cudaMallocHost(&packedSpinor2, spinor_bytes);
#else
if (!packedSpinor1) packedSpinor1 = malloc(spinor_bytes);
if (!packedSpinor2) packedSpinor2 = malloc(spinor_bytes);
#endif
if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
packFullSpinorDD((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packFullSpinorSD((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else packFullSpinorSS((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
}
cudaMemcpy(ret.even.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(ret.odd.spinor, packedSpinor2, spinor_bytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedSpinor2);
#else
free(packedSpinor2);
#endif
packedSpinor2 = 0;
} else {
FullSpinor tmp = allocateSpinorField(2*ret.even.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
loadFullSpinor(tmp, spinor, cpu_prec);
copyQuda(ret.even, tmp.even);
copyQuda(ret.odd, tmp.odd);
freeSpinorField(tmp);
}
}
void loadSpinorField(FullSpinor ret, void *spinor, Precision cpu_prec, DiracFieldOrder dirac_order) {
Precision cuda_prec, DiracFieldOrder dirac_order) {
void *spinor_odd;
if (cpu_prec == QUDA_SINGLE_PRECISION) spinor_odd = (float*)spinor + Nh*spinorSiteSize;
else spinor_odd = (double*)spinor + Nh*spinorSiteSize;
if (dirac_order == QUDA_LEX_DIRAC_ORDER) {
loadFullSpinor(ret, spinor, cpu_prec);
} else if (dirac_order == QUDA_DIRAC_ORDER || dirac_order == QUDA_QDP_DIRAC_ORDER) {
loadParitySpinor(ret.even, spinor, cpu_prec, dirac_order);
loadParitySpinor(ret.odd, spinor_odd, cpu_prec, dirac_order);
} else if (dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
// odd-even so reverse order
loadParitySpinor(ret.even, spinor_odd, cpu_prec, dirac_order);
loadParitySpinor(ret.odd, spinor, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
}
}
void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, DiracFieldOrder dirac_order) {
DiracFieldOrder dirac_order) {
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (spinor.precision != QUDA_HALF_PRECISION) {
size_t spinor_bytes;
if (spinor.precision == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else if (spinor.precision == QUDA_SINGLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(float);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
} else {
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor_bytes);
cudaMemcpy(packedSpinor1, spinor.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (spinor.precision == QUDA_DOUBLE_PRECISION) {
unpackParitySpinorDD((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackParitySpinorSS((float*)res, (float4*)packedSpinor1);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (spinor.precision == QUDA_DOUBLE_PRECISION) {
unpackQDPParitySpinorDD((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackQDPParitySpinorSS((float*)res, (float4*)packedSpinor1);
}
}
} else {
ParitySpinor tmp = allocateParitySpinor(spinor.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
copyQuda(tmp, spinor);
retrieveParitySpinor(res, tmp, cpu_prec, dirac_order);
freeParitySpinor(tmp);
}
}
void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor_bytes);
if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, spinor_bytes);
cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackFullSpinorDS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorSS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
}
if (spinor.even.precision != QUDA_HALF_PRECISION) {
size_t spinor_bytes;
if (spinor.even.precision == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor_bytes);
if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, spinor_bytes);
cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (spinor.even.precision == QUDA_DOUBLE_PRECISION) {
unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackFullSpinorDS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorSS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
}
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedSpinor2);
#else
free(packedSpinor2);
#endif
packedSpinor2 = 0;
} else {
FullSpinor tmp = allocateSpinorField(2*spinor.even.length/spinorSiteSize, QUDA_SINGLE_PRECISION);
copyQuda(tmp.even, spinor.even);
copyQuda(tmp.odd, spinor.odd);
retrieveFullSpinor(res, tmp, cpu_prec);
freeSpinorField(tmp);
}
}
void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, DiracFieldOrder dirac_order) {
Precision cuda_prec, DiracFieldOrder dirac_order) {
void *res_odd;
if (cpu_prec == QUDA_SINGLE_PRECISION) res_odd = (float*)res + Nh*spinorSiteSize;
else res_odd = (double*)res + Nh*spinorSiteSize;
if (dirac_order == QUDA_LEX_DIRAC_ORDER) {
retrieveFullSpinor(res, spinor, cpu_prec);
} else if (dirac_order == QUDA_DIRAC_ORDER || dirac_order == QUDA_QDP_DIRAC_ORDER) {
retrieveParitySpinor(res, spinor.even, cpu_prec, dirac_order);
retrieveParitySpinor(res_odd, spinor.odd, cpu_prec, dirac_order);
} else if (dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
retrieveParitySpinor(res, spinor.odd, cpu_prec, dirac_order);
retrieveParitySpinor(res_odd, spinor.even, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
...
...@@ -19,15 +19,11 @@ extern "C" {
void freeParityClover(ParityClover clover);
void freeCloverField(FullClover clover);
void loadParitySpinor(ParitySpinor, void *spinor, Precision cpu_prec, DiracFieldOrder dirac_order);
void loadSpinorField(FullSpinor, void *spinor, Precision cpu_prec, DiracFieldOrder dirac_order);
void loadSpinorField(FullSpinor, void *spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, DiracFieldOrder dirac_order);
void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, DiracFieldOrder dirac_order);
void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
void spinorHalfPack(float *c, short *s0, float *f0);
void spinorHalfUnpack(float *f0, float *c, short *s0);
...
...@@ -9,8 +9,6 @@
#include <complex>
using namespace std;
typedef complex<float> Complex;
typedef complex<double> Complex16;
struct timeval startTime;
...