Advanced Computing Platform for Theoretical Physics

Committing large files can make the server unstable, so please commit code only and avoid committing large files.

Commit 479eb285 authored by mikeaclark

Added auto-tuning blas

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@586 be54200a-260c-0410-bdd7-ce6af2a381ab
parent ba59a842
Version 0.x
- Add auto-tuning blas to improve performance (see README for details)
- Introduced new interface functions newQudaGaugeParam() and
  newQudaInvertParam() to allow for enhanced error checking. See
  invert_test for an example of their use (a brief sketch also follows below).
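As a hedged illustration of the note above (the authoritative example is tests/invert_test.c), the new constructors might be used along the following lines; the specific fields set here are illustrative only, not a complete or verified parameter list.

#include "quda.h"

/* Illustrative sketch only: newQudaGaugeParam()/newQudaInvertParam() return
   structs pre-filled with default/sentinel values, so the library's internal
   checks can flag any field the caller forgot to set. */
void example_setup(void)
{
  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param  = newQudaInvertParam();

  gauge_param.cpu_prec  = QUDA_DOUBLE_PRECISION;   /* host gauge precision   */
  gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;   /* device gauge precision */

  inv_param.tol     = 1e-7;                        /* solver tolerance       */
  inv_param.maxiter = 1000;                        /* iteration cap          */

  /* any field left untouched is intended to be caught by the internal
     checkGaugeParam()/checkInvertParam() routines rather than used
     uninitialized */
}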
......
......@@ -53,6 +53,15 @@ against lib/libquda.a, and study tests/invert_test.c for an example of
the interface. The various inverter options are enumerated in
include/enum_quda.h.
The lib/blas_quda.cu file contains all of the BLAS-like functions
required for the inverters. The threads per block and blocks per grid
parameters are auto-tuned using the blas_test program in quda/tests,
and the output is stored in blas_param.h, which is included here. These
optimal values may change as a function of the CUDA device and the host
hardware, so re-running blas_test, copying the resulting blas_param.h
over, and recompiling the BLAS library may provide extra performance.
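As a hedged sketch of how this can look in practice (the function name initBlasParam and the meaning of the first array index are assumptions, not the actual blas_quda.cu layout), the generated file is just a list of assignments and can be textually included inside an initialization routine:

int blas_threads[3][22];   // per-precision, per-kernel threads-per-block table
int blas_blocks[3][22];    // per-precision, per-kernel blocks-per-grid table

static void initBlasParam(void)
{
  // blas_param.h, regenerated by running blas_test, contains lines of the
  // form "blas_threads[0][0] = 64;" and is simply included here.
#include "blas_param.h"
}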
Known issues:
......
......@@ -16,6 +16,10 @@ extern "C" {
// ---------- blas_quda.cu ----------
// creates and destroys reduction buffers
void initBlas();
void endBlas();
void zeroCuda(ParitySpinor a);
void copyCuda(ParitySpinor dst, ParitySpinor src);
......@@ -53,6 +57,9 @@ extern "C" {
extern unsigned long long blas_quda_flops;
extern unsigned long long blas_quda_bytes;
extern int blas_threads[3][22];
extern int blas_blocks[3][22];
#ifdef __cplusplus
}
#endif
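A hedged sketch of how the two tables declared above could feed the kernel launch configuration; setBlockSketch, the precision index, and the blasBlock/blasGrid globals are assumptions modelled on the setBlock(kernel, n, precision) calls visible in the reduction code further down, not the actual QUDA implementation.

#include <cuda_runtime.h>
#include "blas_quda.h"

static dim3 blasBlock, blasGrid;   // illustrative globals, as used below

static void setBlockSketch(int kernel, int precision_index)
{
  // kernel indexes the 22 tuned kernels, precision_index the 3 precisions
  blasBlock = dim3(blas_threads[precision_index][kernel], 1, 1);
  blasGrid  = dim3(blas_blocks[precision_index][kernel],  1, 1);
}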
......
/*
Auto-tuned blas CUDA parameters, generated by blas_test
*/
// Kernel: copyCuda
blas_threads[0][0] = 64;
blas_blocks[0][0] = 256;
// Kernel: axpbyCuda
blas_threads[0][1] = 64;
blas_blocks[0][1] = 1024;
// Kernel: xpyCuda
blas_threads[0][2] = 64;
blas_blocks[0][2] = 1024;
// Kernel: axpyCuda
blas_threads[0][3] = 64;
blas_blocks[0][3] = 1024;
// Kernel: xpayCuda
blas_threads[0][4] = 64;
blas_blocks[0][4] = 1024;
// Kernel: mxpyCuda
blas_threads[0][5] = 64;
blas_blocks[0][5] = 1024;
// Kernel: axCuda
blas_threads[0][6] = 64;
blas_blocks[0][6] = 1024;
// Kernel: caxpyCuda
blas_threads[0][7] = 64;
blas_blocks[0][7] = 1024;
// Kernel: caxpbyCuda
blas_threads[0][8] = 64;
blas_blocks[0][8] = 1024;
// Kernel: cxpaypbzCuda
blas_threads[0][9] = 64;
blas_blocks[0][9] = 1024;
// Kernel: axpyZpbxCuda
blas_threads[0][10] = 64;
blas_blocks[0][10] = 512;
// Kernel: caxpbypzYmbwCuda
blas_threads[0][11] = 64;
blas_blocks[0][11] = 1024;
// Kernel: sumCuda
blas_threads[0][12] = 64;
blas_blocks[0][12] = 128;
// Kernel: normCuda
blas_threads[0][13] = 64;
blas_blocks[0][13] = 128;
// Kernel: reDotProductCuda
blas_threads[0][14] = 64;
blas_blocks[0][14] = 64;
// Kernel: axpyNormCuda
blas_threads[0][15] = 64;
blas_blocks[0][15] = 256;
// Kernel: xmyNormCuda
blas_threads[0][16] = 64;
blas_blocks[0][16] = 512;
// Kernel: cDotProductCuda
blas_threads[0][17] = 64;
blas_blocks[0][17] = 64;
// Kernel: xpaycDotzyCuda
blas_threads[0][18] = 64;
blas_blocks[0][18] = 256;
// Kernel: cDotProductNormACuda
blas_threads[0][19] = 64;
blas_blocks[0][19] = 64;
// Kernel: cDotProductNormBCuda
blas_threads[0][20] = 64;
blas_blocks[0][20] = 64;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[0][21] = 64;
blas_blocks[0][21] = 512;
// Kernel: copyCuda
blas_threads[1][0] = 64;
blas_blocks[1][0] = 1024;
// Kernel: axpbyCuda
blas_threads[1][1] = 128;
blas_blocks[1][1] = 128;
// Kernel: xpyCuda
blas_threads[1][2] = 128;
blas_blocks[1][2] = 128;
// Kernel: axpyCuda
blas_threads[1][3] = 128;
blas_blocks[1][3] = 128;
// Kernel: xpayCuda
blas_threads[1][4] = 128;
blas_blocks[1][4] = 128;
// Kernel: mxpyCuda
blas_threads[1][5] = 128;
blas_blocks[1][5] = 128;
// Kernel: axCuda
blas_threads[1][6] = 64;
blas_blocks[1][6] = 128;
// Kernel: caxpyCuda
blas_threads[1][7] = 64;
blas_blocks[1][7] = 128;
// Kernel: caxpbyCuda
blas_threads[1][8] = 64;
blas_blocks[1][8] = 128;
// Kernel: cxpaypbzCuda
blas_threads[1][9] = 64;
blas_blocks[1][9] = 128;
// Kernel: axpyZpbxCuda
blas_threads[1][10] = 128;
blas_blocks[1][10] = 128;
// Kernel: caxpbypzYmbwCuda
blas_threads[1][11] = 64;
blas_blocks[1][11] = 128;
// Kernel: sumCuda
blas_threads[1][12] = 128;
blas_blocks[1][12] = 1024;
// Kernel: normCuda
blas_threads[1][13] = 128;
blas_blocks[1][13] = 1024;
// Kernel: reDotProductCuda
blas_threads[1][14] = 128;
blas_blocks[1][14] = 1024;
// Kernel: axpyNormCuda
blas_threads[1][15] = 128;
blas_blocks[1][15] = 1024;
// Kernel: xmyNormCuda
blas_threads[1][16] = 128;
blas_blocks[1][16] = 1024;
// Kernel: cDotProductCuda
blas_threads[1][17] = 128;
blas_blocks[1][17] = 512;
// Kernel: xpaycDotzyCuda
blas_threads[1][18] = 64;
blas_blocks[1][18] = 128;
// Kernel: cDotProductNormACuda
blas_threads[1][19] = 64;
blas_blocks[1][19] = 1024;
// Kernel: cDotProductNormBCuda
blas_threads[1][20] = 64;
blas_blocks[1][20] = 1024;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[1][21] = 64;
blas_blocks[1][21] = 128;
// Kernel: copyCuda
blas_threads[2][0] = 64;
blas_blocks[2][0] = 128;
// Kernel: axpbyCuda
blas_threads[2][1] = 64;
blas_blocks[2][1] = 128;
// Kernel: xpyCuda
blas_threads[2][2] = 64;
blas_blocks[2][2] = 128;
// Kernel: axpyCuda
blas_threads[2][3] = 64;
blas_blocks[2][3] = 128;
// Kernel: xpayCuda
blas_threads[2][4] = 64;
blas_blocks[2][4] = 128;
// Kernel: mxpyCuda
blas_threads[2][5] = 64;
blas_blocks[2][5] = 128;
// Kernel: axCuda
blas_threads[2][6] = 64;
blas_blocks[2][6] = 1024;
// Kernel: caxpyCuda
blas_threads[2][7] = 64;
blas_blocks[2][7] = 128;
// Kernel: caxpbyCuda
blas_threads[2][8] = 64;
blas_blocks[2][8] = 128;
// Kernel: cxpaypbzCuda
blas_threads[2][9] = 64;
blas_blocks[2][9] = 1024;
// Kernel: axpyZpbxCuda
blas_threads[2][10] = 64;
blas_blocks[2][10] = 128;
// Kernel: caxpbypzYmbwCuda
blas_threads[2][11] = 64;
blas_blocks[2][11] = 128;
// Kernel: sumCuda
blas_threads[2][12] = 128;
blas_blocks[2][12] = 128;
// Kernel: normCuda
blas_threads[2][13] = 128;
blas_blocks[2][13] = 128;
// Kernel: reDotProductCuda
blas_threads[2][14] = 128;
blas_blocks[2][14] = 128;
// Kernel: axpyNormCuda
blas_threads[2][15] = 64;
blas_blocks[2][15] = 128;
// Kernel: xmyNormCuda
blas_threads[2][16] = 64;
blas_blocks[2][16] = 128;
// Kernel: cDotProductCuda
blas_threads[2][17] = 64;
blas_blocks[2][17] = 128;
// Kernel: xpaycDotzyCuda
blas_threads[2][18] = 64;
blas_blocks[2][18] = 128;
// Kernel: cDotProductNormACuda
blas_threads[2][19] = 64;
blas_blocks[2][19] = 128;
// Kernel: cDotProductNormBCuda
blas_threads[2][20] = 64;
blas_blocks[2][20] = 128;
// Kernel: caxpbypzYmbwcDotProductWYNormYQuda
blas_threads[2][21] = 64;
blas_blocks[2][21] = 256;
......@@ -29,8 +29,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor v = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor x_sloppy, r_sloppy, r0;
if (invert_param->cuda_prec_sloppy == x.precision) {
......@@ -47,6 +47,14 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
zeroCuda(x_sloppy);
copyCuda(r_sloppy, b);
/*{
cuDoubleComplex rv;
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(r0, v);
printf("%e %e %e %e %e\n", rv.x, rv.y, normCuda(r_sloppy), normCuda(v), normCuda(tmp)); exit(0);
} */
zeroCuda(y);
double b2 = normCuda(b);
......@@ -101,8 +109,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
MatVec(v, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, p, invert_param, tmp, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(r0, v);
rv = cDotProductCuda(r0, v);
alpha = cuCdiv(rho, rv);
// r -= alpha*v
......@@ -119,7 +127,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, r0);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates
rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm;
......
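For reference, the fused caxpbypzYmbwcDotProductWYNormYQuda call above performs the following BiCGstab update steps in a single pass (this merely restates the in-code comment in equation form):

\[
  x \leftarrow x + \alpha\,p + \omega\,r, \qquad
  r \leftarrow r - \omega\,t, \qquad
  r2 = (r,r), \qquad
  \rho = (r_0,r).
\]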
......@@ -18,10 +18,13 @@ FullClover cudaCloverSloppy;
FullClover cudaCloverInvPrecise; // inverted clover term
FullClover cudaCloverInvSloppy;
void initBlas();
// define newQudaGaugeParam() and newQudaInvertParam()
#define INIT_PARAM
#include "check_params.h"
#undef INIT_PARAM
>>>>>>> .r585
// define (static) checkGaugeParam() and checkInvertParam()
#define CHECK_PARAM
......@@ -88,6 +91,8 @@ void initQuda(int dev)
cudaCloverInvSloppy.even.clover = NULL;
cudaCloverInvSloppy.odd.clover = NULL;
initBlas();
}
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
......@@ -204,6 +209,7 @@ void endQuda(void)
if (cudaCloverSloppy.even.clover) freeCloverField(&cudaCloverSloppy);
if (cudaCloverInvPrecise.even.clover) freeCloverField(&cudaCloverInvPrecise);
if (cudaCloverInvSloppy.even.clover) freeCloverField(&cudaCloverInvSloppy);
endBlas();
}
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger)
......
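A hedged sketch of the resulting host-side lifecycle (parameter setup and field loading elided; see tests/invert_test.c): with this change, the BLAS reduction buffers live exactly between initQuda() and endQuda().

#include "quda.h"

int main(void)
{
  initQuda(0);   // selects the device; now also calls initBlas()
  /* ... loadGaugeQuda(), dslashQuda()/solver calls, etc. ... */
  endQuda();     // frees gauge/clover fields; now also calls endBlas()
  return 0;
}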
......@@ -7,8 +7,8 @@
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
unsigned int gridSize = reduce_threads*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
......@@ -32,15 +32,17 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { ZCACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { ZCACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (reduce_threads >= 1024) { if (tid < 512) { ZCACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
if (reduce_threads >= 512) { if (tid < 256) { ZCACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { ZCACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { ZCACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); }
if (reduce_threads >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); }
if (reduce_threads >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); }
if (reduce_threads >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); }
if (reduce_threads >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); }
if (reduce_threads >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); }
if (reduce_threads >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single QudaSumComplex
......@@ -54,8 +56,8 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*REDUCE_THREADS + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
unsigned int gridSize = reduce_threads*gridDim.x;
extern __shared__ QudaSumComplex cdata[];
QudaSumComplex *s = cdata + tid;
......@@ -72,16 +74,18 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
__syncthreads();
// do reduction in shared mem
if (REDUCE_THREADS >= 256) { if (tid < 128) { s[0].x += s[128].x; s[0].y += s[128].y; } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; } __syncthreads(); }
if (reduce_threads >= 1024) { if (tid < 512) { s[0].x += s[512].x; s[0].y += s[512].y; } __syncthreads(); }
if (reduce_threads >= 512) { if (tid < 256) { s[0].x += s[256].x; s[0].y += s[256].y; } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { s[0].x += s[128].x; s[0].y += s[128].y; } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; }
if (REDUCE_THREADS >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; }
if (REDUCE_THREADS >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; }
if (REDUCE_THREADS >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; }
if (REDUCE_THREADS >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; }
if (REDUCE_THREADS >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; }
if (reduce_threads >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; }
if (reduce_threads >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; }
if (reduce_threads >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; }
if (reduce_threads >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; }
if (reduce_threads >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; }
if (reduce_threads >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; }
}
// write result for this block to global mem
......@@ -94,28 +98,43 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
#endif
template <typename Float, typename Float2>
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length %d must be a multiple of %d\n", n, REDUCE_THREADS);
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision precision) {
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_complex(): length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
}
// allocate arrays on device and host to store one QudaSumComplex for each block
int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
initReduceComplex(blocks);
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_complex: block size greater then maximum permitted\n");
exit(-1);
}
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
#if (REDUCE_TYPE == REDUCE_KAHAN)
int smemSize = REDUCE_THREADS * 2 * sizeof(QudaSumComplex);
int smemSize = blasBlock.x * 2 * sizeof(QudaSumComplex);
#else
int smemSize = REDUCE_THREADS * sizeof(QudaSumComplex);
int smemSize = blasBlock.x * sizeof(QudaSumComplex);
#endif
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
if (blasBlock.x == 64) {
REDUCE_FUNC_NAME(Kernel)<64><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else if (blasBlock.x == 128) {
REDUCE_FUNC_NAME(Kernel)<128><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else if (blasBlock.x == 256) {
REDUCE_FUNC_NAME(Kernel)<256><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else if (blasBlock.x == 512) {
REDUCE_FUNC_NAME(Kernel)<512><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceComplex, d_reduceComplex, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
cudaError_t error = cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
......@@ -124,7 +143,7 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
cuDoubleComplex gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
for (int i = 0; i < blocks; i++) {
for (int i = 0; i < blasGrid.x; i++) {
gpu_result.x += h_reduceComplex[i].x;
gpu_result.y += h_reduceComplex[i].y;
}
......
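The dispatch above maps the runtime-tuned block size onto a compile-time template parameter so the unrolled shared-memory reduction still specializes correctly. Below is a standalone, hedged sketch of the same pattern on a plain sum reduction (not the QUDA kernels themselves).

#include <cstdio>
#include <cuda_runtime.h>

template <unsigned int block_size>
__global__ void sumKernel(const float *in, float *out, unsigned int n)
{
  __shared__ float s[block_size];
  unsigned int tid = threadIdx.x;
  float acc = 0.0f;

  // grid-stride accumulation into a register
  for (unsigned int i = blockIdx.x * block_size + tid; i < n;
       i += block_size * gridDim.x)
    acc += in[i];
  s[tid] = acc;
  __syncthreads();

  // tree reduction in shared memory; block_size is known at compile time
  for (unsigned int stride = block_size / 2; stride > 0; stride /= 2) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = s[0];
}

void launchSum(const float *in, float *out, unsigned int n,
               dim3 grid, dim3 block)
{
  switch (block.x) {  // runtime tuned value -> compile-time template argument
  case   64: sumKernel<  64><<<grid, block>>>(in, out, n); break;
  case  128: sumKernel< 128><<<grid, block>>>(in, out, n); break;
  case  256: sumKernel< 256><<<grid, block>>>(in, out, n); break;
  case  512: sumKernel< 512><<<grid, block>>>(in, out, n); break;
  case 1024: sumKernel<1024><<<grid, block>>>(in, out, n); break;
  default:   printf("Reduction not implemented for %u threads\n", block.x);
  }
}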
......@@ -5,8 +5,8 @@
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
unsigned int i = blockIdx.x*(reduce_threads) + threadIdx.x;
unsigned int gridSize = reduce_threads*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
......@@ -24,15 +24,17 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, u
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (reduce_threads >= 1024) { if (tid < 512) { DSACC(s[0],s[1],s[1024+0],s[1024+1]); } __syncthreads(); }
if (reduce_threads >= 512) { if (tid < 256) { DSACC(s[0],s[1],s[512+0],s[512+1]); } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); }
if (reduce_threads >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); }
if (reduce_threads >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); }
if (reduce_threads >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); }
if (reduce_threads >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); }
if (reduce_threads >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); }
if (reduce_threads >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single float
......@@ -43,8 +45,8 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, u
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*REDUCE_THREADS + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
unsigned int i = blockIdx.x*reduce_threads + threadIdx.x;
unsigned int gridSize = reduce_threads*gridDim.x;
extern __shared__ QudaSumFloat sdata[];
QudaSumFloat *s = sdata + tid;
......@@ -58,16 +60,18 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, u
__syncthreads();
// do reduction in shared mem
if (REDUCE_THREADS >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthreads(); }
if (reduce_threads >= 1024) { if (tid < 512) { s[0] += s[512]; } __syncthreads(); }
if (reduce_threads >= 512) { if (tid < 256) { s[0] += s[256]; } __syncthreads(); }
if (reduce_threads >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
if (reduce_threads >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthread