Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 54ccabd5 authored by mikeaclark
Browse files

Removed continual memory allocations from reductions

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@440 be54200a-260c-0410-bdd7-ce6af2a381ab
parent f89cf306
......@@ -22,6 +22,62 @@
#define QudaSumFloat3 float3
#endif
// These are used for reduction kernels
// Persistent device-side partial-sum buffers, allocated once by the
// initReduce* helpers below and reused across reduction calls (this commit
// removes the per-call cudaMalloc/cudaFree churn).
QudaSumFloat *d_reduceFloat;
QudaSumComplex *d_reduceComplex;
QudaSumFloat3 *d_reduceFloat3;
// Host-side staging arrays the per-block partial sums are copied back into
// before the final CPU accumulation (see the reduce drivers below).
QudaSumFloat h_reduceFloat[REDUCE_MAX_BLOCKS];
QudaSumComplex h_reduceComplex[REDUCE_MAX_BLOCKS];
QudaSumFloat3 h_reduceFloat3[REDUCE_MAX_BLOCKS];
// Current capacity (in blocks) of each device buffer; 0 = not yet allocated.
int blocksFloat = 0;
int blocksComplex = 0;
int blocksFloat3 = 0;
// (Re)allocate the device partial-sum buffer for float reductions.
// Only touches the allocator when the requested block count changes,
// so repeated reductions of the same size incur no cudaMalloc cost.
void initReduceFloat(int blocks) {
  if (blocks != blocksFloat) {
    // Free the previous *device* buffer. (Bug fix: the original passed the
    // static host array h_reduceFloat to cudaFree, leaking d_reduceFloat
    // and handing cudaFree a pointer it never allocated.)
    if (blocksFloat > 0) cudaFree(d_reduceFloat);
    if (cudaMalloc((void**) &d_reduceFloat, blocks*sizeof(QudaSumFloat))) {
      printf("Error allocating reduction matrix\n");
      exit(1); // nonzero exit: allocation failure must not report success
    }
    blocksFloat = blocks;
    printf("Initialized reduce floats %d\n", blocksFloat);
  }
}
// (Re)allocate the device partial-sum buffer for complex reductions.
// Only reallocates when the requested block count changes.
void initReduceComplex(int blocks) {
  if (blocks != blocksComplex) {
    // Free the previous *device* buffer. (Bug fix: the original passed the
    // static host array h_reduceComplex to cudaFree, leaking d_reduceComplex.)
    if (blocksComplex > 0) cudaFree(d_reduceComplex);
    if (cudaMalloc((void**) &d_reduceComplex, blocks*sizeof(QudaSumComplex))) {
      printf("Error allocating reduction matrix\n");
      exit(1); // nonzero exit: allocation failure must not report success
    }
    blocksComplex = blocks;
    printf("Initialized reduce complex %d\n", blocksComplex);
  }
}
// (Re)allocate the device partial-sum buffer for float3 reductions.
// Only reallocates when the requested block count changes.
void initReduceFloat3(int blocks) {
  if (blocks != blocksFloat3) {
    // Free the previous *device* buffer. (Bug fix: the original passed the
    // static host array h_reduceFloat3 to cudaFree, leaking d_reduceFloat3.)
    if (blocksFloat3 > 0) cudaFree(d_reduceFloat3);
    if (cudaMalloc((void**) &d_reduceFloat3, blocks*sizeof(QudaSumFloat3))) {
      printf("Error allocating reduction matrix\n");
      exit(1); // nonzero exit: allocation failure must not report success
    }
    blocksFloat3 = blocks;
    printf("Initialized reduce float3 %d\n", blocksFloat3);
  }
}
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
......
......@@ -23,13 +23,13 @@ int main(int argc, char **argv)
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_YES;
Gauge_param.anisotropy = 1.0;
......@@ -42,12 +42,12 @@ int main(int argc, char **argv)
double mass = -0.97;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-12;
inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-2;
inv_param.maxiter = 100;
inv_param.reliable_delta = 1e-8;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_HALF_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; // preservation doesn't work with reliable?
......
// Complex reduction driver: each block produces one QudaSumComplex partial
// sum in the preallocated d_reduceComplex buffer; the final sum over blocks
// is done on the CPU in double precision.
// (Repaired from scraped diff residue that interleaved the pre- and
// post-change lines: removed the per-call cudaMalloc/cudaFree of d_odata
// and the duplicate kernel launch; kept the cached-buffer version.)
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
  // NOTE(review): if n < REDUCE_THREADS this yields blocks == 0 and an
  // empty launch — confirm callers guarantee n >= REDUCE_THREADS.
  int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
  initReduceComplex(blocks);

  // partial reduction; each block generates one number
  dim3 dimBlock(REDUCE_THREADS, 1, 1);
  dim3 dimGrid(blocks, 1, 1);
  int smemSize = REDUCE_THREADS * sizeof(QudaSumComplex);
  REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);

  // copy result from device to host, and perform final reduction on CPU
  // (the blocking cudaMemcpy also synchronizes with the kernel above)
  cudaMemcpy(h_reduceComplex, d_reduceComplex, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
  cuDoubleComplex gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  for (int i = 0; i < blocks; i++) {
    gpu_result.x += h_reduceComplex[i].x;
    gpu_result.y += h_reduceComplex[i].y;
  }
  return gpu_result;
}
......
// Float reduction driver: each block produces one QudaSumFloat partial sum
// in the preallocated d_reduceFloat buffer; the final sum over blocks is
// accumulated on the CPU in double precision.
// (Repaired from scraped diff residue that interleaved the pre- and
// post-change lines: removed the per-call d_odata/h_odata allocation path
// and the duplicate kernel launch; kept the cached-buffer version.)
double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
  // NOTE(review): if n < REDUCE_THREADS this yields blocks == 0 and an
  // empty launch — confirm callers guarantee n >= REDUCE_THREADS.
  int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
  initReduceFloat(blocks);

  // partial reduction; each block generates one number
  dim3 dimBlock(REDUCE_THREADS, 1, 1);
  dim3 dimGrid(blocks, 1, 1);
  int smemSize = REDUCE_THREADS * sizeof(QudaSumFloat);
  REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);

  // copy result from device to host, and perform final reduction on CPU
  // (the blocking cudaMemcpy also synchronizes with the kernel above)
  cudaMemcpy(h_reduceFloat, d_reduceFloat, blocks*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
  double cpu_sum = 0;
  for (int i = 0; i < blocks; i++)
    cpu_sum += h_reduceFloat[i];
  return cpu_sum;
}
......
// Float3 reduction driver: each block produces one QudaSumFloat3 partial
// sum in the preallocated d_reduceFloat3 buffer; the x/y/z components are
// summed over blocks on the CPU in double precision.
// (Repaired from scraped diff residue that interleaved the pre- and
// post-change lines: removed the per-call d_odata/h_odata allocation path
// and the duplicate kernel launch; kept the cached-buffer version.)
double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
  // NOTE(review): if n < REDUCE_THREADS this yields blocks == 0 and an
  // empty launch — confirm callers guarantee n >= REDUCE_THREADS.
  int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
  initReduceFloat3(blocks);

  // partial reduction; each block generates one number
  dim3 dimBlock(REDUCE_THREADS, 1, 1);
  dim3 dimGrid(blocks, 1, 1);
  int smemSize = REDUCE_THREADS * sizeof(QudaSumFloat3);
  REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);

  // copy result from device to host, and perform final reduction on CPU
  // (the blocking cudaMemcpy also synchronizes with the kernel above)
  cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blocks*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
  double3 gpu_result;
  gpu_result.x = 0;
  gpu_result.y = 0;
  gpu_result.z = 0;
  for (int i = 0; i < blocks; i++) {
    gpu_result.x += h_reduceFloat3[i].x;
    gpu_result.y += h_reduceFloat3[i].y;
    gpu_result.z += h_reduceFloat3[i].z;
  }
  return gpu_result;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment