Advanced Computing Platform for Theoretical Physics

Commit 9a1d73f7 authored by rbabich

quda-0.2: merged fix for reductions in emulation mode from quda r700


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@713 be54200a-260c-0410-bdd7-ce6af2a381ab
parent f3b020ce
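This is the warp-synchronous reduction idiom familiar from NVIDIA's SDK reduction sample: on hardware the last 32 active threads of a block run in lockstep, so the final unrolled steps need no barrier, but in device-emulation mode every CUDA thread is an ordinary host thread and the lockstep assumption breaks. The hunks below apply the same two-part fix to each reduction kernel: an EMUSYNC barrier after every unrolled step, and the "if (tid < 32)" guard compiled out under __DEVICE_EMULATION__ so that all threads reach the barrier (a __syncthreads() that only some threads reach would deadlock). A minimal sketch of the patched pattern, with hypothetical names rather than the QUDA source (launch with reduce_threads threads per block and reduce_threads*sizeof(float) of dynamic shared memory):

#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

// Sketch: block-level sum with a warp-synchronous tail (illustrative only).
template <int reduce_threads>
__global__ void sumKernel(const float *g_idata, float *g_odata, unsigned int n) {
  extern __shared__ float sdata[];
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*reduce_threads + tid;
  sdata[tid] = (i < n) ? g_idata[i] : 0.0f;
  __syncthreads();

  // Tree reduction: full barriers while more than one warp is active.
  if (reduce_threads >= 512) { if (tid < 256) sdata[tid] += sdata[tid+256]; __syncthreads(); }
  if (reduce_threads >= 256) { if (tid < 128) sdata[tid] += sdata[tid+128]; __syncthreads(); }
  if (reduce_threads >= 128) { if (tid <  64) sdata[tid] += sdata[tid+ 64]; __syncthreads(); }

#ifndef __DEVICE_EMULATION__
  if (tid < 32)
#endif
  {
    // volatile keeps the compiler from caching shared values in registers
    // across the unrolled, barrier-free steps.
    volatile float *s = sdata + tid;
    if (reduce_threads >= 64) { s[0] += s[32]; EMUSYNC; }
    if (reduce_threads >= 32) { s[0] += s[16]; EMUSYNC; }
    if (reduce_threads >= 16) { s[0] += s[ 8]; EMUSYNC; }
    if (reduce_threads >=  8) { s[0] += s[ 4]; EMUSYNC; }
    if (reduce_threads >=  4) { s[0] += s[ 2]; EMUSYNC; }
    if (reduce_threads >=  2) { s[0] += s[ 1]; EMUSYNC; }
  }

  // One partial sum per block; a second stage finishes the reduction.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

Without the fix, emulation runs the unrolled steps with no synchronization between host threads, so the partial sums come out wrong; adding the barrier alone would not do, because threads 32 and up would never reach it inside the guard.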
@@ -21,6 +21,13 @@
#define QudaSumFloat3 float3
#endif
// Required for the reduction kernels
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif
// These are used for reduction kernels
static QudaSumFloat *d_reduceFloat=0;
static QudaSumComplex *d_reduceComplex=0;
@@ -187,6 +194,24 @@ double2 __device__ make_Float2(double2 x) {
float4 a##5 = tex1Dfetch(tex, i + 5*length); \
float a##c = a[i];
#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
__device__ short float2short(float c, float a) {
//return (short)(a*MAX_SHORT);
short rtn = (short)((a+SHIFT_FLOAT)*SCALE_FLOAT*c);
return rtn;
}
__device__ float short2float(short a) {
return (float)a/SCALE_FLOAT - SHIFT_FLOAT;
}
__device__ short4 float42short4(float c, float4 a) {
return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
}
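These helpers implement the half-precision storage format: each spinor component is kept as a 16-bit short plus a per-site norm. At pack time the component is multiplied by c (the reciprocal of the site's maximum magnitude, computed in the CONSTRUCT_HALF_SPINOR macros below) so it lands in roughly [-1, 1], then mapped onto the short range; short2float undoes the mapping, and the stored norm is multiplied back in when the spinor is read. A quick host-side round-trip check using the same constants (my names, illustrative only, not part of the commit):

#include <cstdio>

#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) * 0.5)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))

// Host mirrors of float2short()/short2float() for measuring the
// quantization error of the 16-bit representation.
short pack(float c, float a) { return (short)((a + SHIFT_FLOAT)*SCALE_FLOAT*c); }
float unpack(short a) { return (float)a/SCALE_FLOAT - SHIFT_FLOAT; }

int main() {
  const float norm = 0.8f;      // per-site maximum magnitude (made up)
  const float c = 1.0f/norm;    // scale applied when packing
  const float vals[] = {-0.8f, -0.3f, 0.0f, 0.5f, 0.8f};
  for (float a : vals) {
    short q = pack(c, a);          // q fits in a short by construction
    float back = norm*unpack(q);   // the stored norm is re-applied on read
    printf("% .5f -> %6d -> % .5f (err %.1e)\n", a, q, back, a - back);
  }
  return 0;
}

The error stays near norm/2^15, which is the point of the per-site norm: components are quantized relative to the largest entry at that site rather than a global scale.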
#define CONSTRUCT_HALF_SPINOR_FROM_SINGLE(h, n, a, length) \
{float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y)); \
float c1 = fmaxf(fabsf((a##0).z), fabsf((a##0).w)); \
@@ -217,7 +242,17 @@ double2 __device__ make_Float2(double2 x) {
h[i+4*length] = make_short4((short)(C*(float)(a##4).x), (short)(C*(float)(a##4).y), \
(short)(C*(float)(a##4).z), (short)(C*(float)(a##4).w)); \
h[i+5*length] = make_short4((short)(C*(float)(a##5).x), (short)(C*(float)(a##5).y), \
(short)(C*(float)(a##5).z), (short)(C*(float)(a##5).w));}
/*
float C = 1.0f / c0; \
h[i+0*length] = float42short4(C, a##0); \
h[i+1*length] = float42short4(C, a##1); \
h[i+2*length] = float42short4(C, a##2); \
h[i+3*length] = float42short4(C, a##3); \
h[i+4*length] = float42short4(C, a##4); \
h[i+5*length] = float42short4(C, a##5);}
*/
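The block commented out above is an equivalent packing path expressed through float42short4(), which would keep the scale-and-shift logic in one place instead of repeating the inline (short)(C*...) casts; this commit leaves it disabled.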
#define CONSTRUCT_HALF_SPINOR_FROM_DOUBLE(h, n, a, length) \
{float c0 = fmaxf(fabsf((a##0).x), fabsf((a##0).y)); \
@@ -1461,6 +1496,24 @@ double normCuda(ParitySpinor a) {
}
#if 0
double normEven(const cudaColorSpinorField &a) {
blas_quda_flops += 2*a.real_length;
blas_quda_bytes += a.real_length*a.precision;
if (a.precision == QUDA_DOUBLE_PRECISION) {
return normDCuda((double*)a.v, a.length/2, 13, a.precision);
} else if (a.precision == QUDA_SINGLE_PRECISION) {
return normSCuda((float2*)a.v, a.length/4, 13, a.precision);
} else {
int spinor_bytes = a.length*sizeof(short);
cudaBindTexture(0, texHalf1, a.v, spinor_bytes);
cudaBindTexture(0, texNorm1, a.norm, spinor_bytes/12);
blas_quda_bytes += (2*a.real_length*a.precision) / (a.nColor * a.nSpin);
return normHCuda((float*)a.norm, a.stride/2, a.volume/2, 13, a.precision);
}
}
#endif
//
// double reDotProductFCuda(float *a, float *b, int n) {}
@@ -36,14 +36,18 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
if (reduce_threads >= 512) { if (tid < 256) { ZCACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { ZCACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { ZCACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); }
if (reduce_threads >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); }
if (reduce_threads >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); }
if (reduce_threads >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); }
if (reduce_threads >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); }
if (reduce_threads >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); EMUSYNC; }
if (reduce_threads >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); EMUSYNC; }
if (reduce_threads >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); EMUSYNC; }
if (reduce_threads >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); EMUSYNC; }
if (reduce_threads >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); EMUSYNC; }
if (reduce_threads >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); EMUSYNC; }
}
// write result for this block to global mem as single QudaSumComplex
if (tid == 0) {
@@ -79,14 +83,17 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
if (reduce_threads >= 256) { if (tid < 128) { s[0].x += s[128].x; s[0].y += s[128].y; } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; }
if (reduce_threads >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; }
if (reduce_threads >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; }
if (reduce_threads >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; }
if (reduce_threads >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; }
if (reduce_threads >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; EMUSYNC; }
if (reduce_threads >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; EMUSYNC; }
if (reduce_threads >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; EMUSYNC; }
if (reduce_threads >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; EMUSYNC; }
if (reduce_threads >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; EMUSYNC; }
if (reduce_threads >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) {
#if (REDUCE_TYPE == REDUCE_KAHAN)
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
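Despite the REDUCE_KAHAN name, DSACC accumulates in double-single arithmetic: each running sum is a pair of floats (c0, c1) holding high and low parts, and dsadd adds two such pairs while capturing the rounding error of the high-part sum instead of discarding it. A host-side sketch of that style of addition in the widely used DSFUN-style formulation (my helper, for illustration only; it assumes strict IEEE float, so compile without -ffast-math):

#include <cstdio>

// Double-single addition: (c0, c1) = (a0, a1) + (b0, b1). Knuth's Two-Sum
// recovers the rounding error of a0 + b0 and folds it into the low part.
void ds_add(float &c0, float &c1, float a0, float a1, float b0, float b1) {
  float t1 = a0 + b0;
  float e  = t1 - a0;
  float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;  // recovered error + low parts
  c0 = t1 + t2;            // renormalize so the low part stays small
  c1 = t2 - (c0 - t1);
}

int main() {
  float s0 = 0.0f, s1 = 0.0f;   // double-single accumulator
  float naive = 0.0f;           // plain float accumulator
  for (int i = 0; i < 10000000; i++) {
    ds_add(s0, s1, s0, s1, 1e-4f, 0.0f);
    naive += 1e-4f;
  }
  // The increments sum to ~1000; the plain float sum drifts once it grows
  // large enough that 1e-4 falls below the spacing of representable floats.
  printf("double-single: %.6f   naive float: %.6f\n", (double)s0 + s1, naive);
  return 0;
}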
@@ -28,14 +27,19 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, u
if (reduce_threads >= 512) { if (tid < 256) { DSACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); }
if (reduce_threads >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); }
if (reduce_threads >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); }
if (reduce_threads >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); }
if (reduce_threads >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); }
if (reduce_threads >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); EMUSYNC; }
if (reduce_threads >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); EMUSYNC; }
if (reduce_threads >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); EMUSYNC; }
if (reduce_threads >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); EMUSYNC; }
if (reduce_threads >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); EMUSYNC; }
if (reduce_threads >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); EMUSYNC; }
}
// write result for this block to global mem as single float
if (tid == 0) g_odata[blockIdx.x] = sdata[0]+sdata[1];
@@ -65,17 +69,23 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, u
if (reduce_threads >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { s[0] += s[32]; }
if (reduce_threads >= 32) { s[0] += s[16]; }
if (reduce_threads >= 16) { s[0] += s[ 8]; }
if (reduce_threads >= 8) { s[0] += s[ 4]; }
if (reduce_threads >= 4) { s[0] += s[ 2]; }
if (reduce_threads >= 2) { s[0] += s[ 1]; }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { s[0] += s[32]; EMUSYNC; }
if (reduce_threads >= 32) { s[0] += s[16]; EMUSYNC; }
if (reduce_threads >= 16) { s[0] += s[ 8]; EMUSYNC; }
if (reduce_threads >= 8) { s[0] += s[ 4]; EMUSYNC; }
if (reduce_threads >= 4) { s[0] += s[ 2]; EMUSYNC; }
if (reduce_threads >= 2) { s[0] += s[ 1]; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = s[0];
if (tid == 0) {
g_odata[blockIdx.x] = s[0];
}
}
#endif
@@ -119,6 +129,7 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
double cpu_sum = 0;
for (int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];
return cpu_sum;
}
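The reductions are two-stage: each kernel writes one partial sum per block to g_odata, the host-side wrapper copies the blasGrid.x partials back (h_reduceFloat here), and the final accumulation is finished in double precision on the CPU, as the loop above shows.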
@@ -41,14 +41,18 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata,
if (reduce_threads >= 512) { if (tid < 256) { DSACC3(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { DSACC3(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { DSACC3(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { DSACC3(s[0],s[1],s[64+0],s[64+1]); }
if (reduce_threads >= 32) { DSACC3(s[0],s[1],s[32+0],s[32+1]); }
if (reduce_threads >= 16) { DSACC3(s[0],s[1],s[16+0],s[16+1]); }
if (reduce_threads >= 8) { DSACC3(s[0],s[1], s[8+0], s[8+1]); }
if (reduce_threads >= 4) { DSACC3(s[0],s[1], s[4+0], s[4+1]); }
if (reduce_threads >= 2) { DSACC3(s[0],s[1], s[2+0], s[2+1]); }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { DSACC3(s[0],s[1],s[64+0],s[64+1]); EMUSYNC; }
if (reduce_threads >= 32) { DSACC3(s[0],s[1],s[32+0],s[32+1]); EMUSYNC; }
if (reduce_threads >= 16) { DSACC3(s[0],s[1],s[16+0],s[16+1]); EMUSYNC; }
if (reduce_threads >= 8) { DSACC3(s[0],s[1], s[8+0], s[8+1]); EMUSYNC; }
if (reduce_threads >= 4) { DSACC3(s[0],s[1], s[4+0], s[4+1]); EMUSYNC; }
if (reduce_threads >= 2) { DSACC3(s[0],s[1], s[2+0], s[2+1]); EMUSYNC; }
}
// write result for this block to global mem as single QudaSumFloat3
if (tid == 0) {
@@ -92,14 +96,17 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata,
if (reduce_threads >= 128)
{ if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; s[0].z += s[ 64].z; } __syncthreads(); }
if (tid < 32) {
if (reduce_threads >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; s[0].z += s[32].z; }
if (reduce_threads >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; s[0].z += s[16].z; }
if (reduce_threads >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; s[0].z += s[ 8].z; }
if (reduce_threads >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; s[0].z += s[ 4].z; }
if (reduce_threads >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; s[0].z += s[ 2].z; }
if (reduce_threads >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; s[0].z += s[ 1].z; }
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (reduce_threads >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; s[0].z += s[32].z; EMUSYNC; }
if (reduce_threads >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; s[0].z += s[16].z; EMUSYNC; }
if (reduce_threads >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; s[0].z += s[ 8].z; EMUSYNC; }
if (reduce_threads >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; s[0].z += s[ 4].z; EMUSYNC; }
if (reduce_threads >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; s[0].z += s[ 2].z; EMUSYNC; }
if (reduce_threads >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; s[0].z += s[ 1].z; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) {