Advanced Computing Platform for Theoretical Physics

Commit 58a19a3e authored by mikeaclark's avatar mikeaclark
Browse files

Fixed clover memory bug

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@492 be54200a-260c-0410-bdd7-ce6af2a381ab
parent a61f7cf1
......@@ -40,7 +40,7 @@ unsigned long long blas_quda_bytes;
void initReduceFloat(int blocks) {
if (blocks != blocksFloat) {
if (blocksFloat > 0) cudaFree(h_reduceFloat);
if (blocksFloat > 0) cudaFree(d_reduceFloat);
if (cudaMalloc((void**) &d_reduceFloat, blocks*sizeof(QudaSumFloat))) {
printf("Error allocating reduction matrix\n");
......@@ -55,7 +55,7 @@ void initReduceFloat(int blocks) {
void initReduceComplex(int blocks) {
if (blocks != blocksComplex) {
if (blocksComplex > 0) cudaFree(h_reduceComplex);
if (blocksComplex > 0) cudaFree(d_reduceComplex);
if (cudaMalloc((void**) &d_reduceComplex, blocks*sizeof(QudaSumComplex))) {
printf("Error allocating reduction matrix\n");
......@@ -69,7 +69,7 @@ void initReduceComplex(int blocks) {
void initReduceFloat3(int blocks) {
if (blocks != blocksFloat3) {
if (blocksFloat3 > 0) cudaFree(h_reduceFloat3);
if (blocksFloat3 > 0) cudaFree(d_reduceFloat3);
if (cudaMalloc((void**) &d_reduceFloat3, blocks*sizeof(QudaSumFloat3))) {
printf("Error allocating reduction matrix\n");
......@@ -701,12 +701,21 @@ void mxpyCuda(ParitySpinor x, ParitySpinor y) {
blas_quda_flops += x.length;
}
template <typename Float>
__global__ void axKernel(Float a, Float *x, int len) {
float2 __device__ make_Float2(float x, float y) {
return make_float2(x, y);
}
double2 __device__ make_Float2(double x, double y) {
return make_double2(x, y);
}
template <typename Float, typename Float2>
__global__ void axKernel(Float a, Float2 *x, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
x[i] *= a;
x[i].x *= a;
x[i].y *= a;
i += gridSize;
}
}
......@@ -731,9 +740,9 @@ void axCuda(double a, ParitySpinor x) {
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 2*x.length*x.precision;
if (x.precision == QUDA_DOUBLE_PRECISION) {
axKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, x.length);
axKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, x.length/2);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
axKernel<<<dimGrid, dimBlock>>>((float)a, (float*)x.spinor, x.length);
axKernel<<<dimGrid, dimBlock>>>((float)a, (float2*)x.spinor, x.length/2);
} else {
int spinor_bytes = x.length*sizeof(short);
cudaBindTexture(0, texHalf1, x.spinor, spinor_bytes);
......@@ -743,14 +752,6 @@ void axCuda(double a, ParitySpinor x) {
blas_quda_flops += x.length;
}
float2 __device__ make_Float2(float x, float y) {
return make_float2(x, y);
}
double2 __device__ make_Float2(double x, double y) {
return make_double2(x, y);
}
template <typename Float2>
__global__ void caxpyKernel(Float2 a, Float2 *x, Float2 *y, int len) {
......@@ -940,14 +941,16 @@ void cxpaypbzCuda(ParitySpinor x, double2 a, ParitySpinor y, double2 b, ParitySp
}
}
template <typename Float>
__global__ void axpyZpbxKernel(Float a, Float *x, Float *y, Float *z, Float b, int len) {
template <typename Float, typename Float2>
__global__ void axpyZpbxKernel(Float a, Float2 *x, Float2 *y, Float2 *z, Float b, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
Float x_i = x[i];
y[i] += a*x_i;
x[i] = z[i] + b*x_i;
Float2 x_i = make_Float2(x[i].x, x[i].y);
y[i].x += a*x_i.x;
y[i].y += a*x_i.y;
x[i].x = z[i].x + b*x_i.x;
x[i].y = z[i].y + b*x_i.y;
i += gridSize;
}
}
......@@ -988,9 +991,9 @@ void axpyZpbxCuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, doub
dim3 dimGrid(blocks, 1, 1);
blas_quda_bytes += 5*x.length*x.precision;
if (x.precision == QUDA_DOUBLE_PRECISION) {
axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, (double*)x.spinor, (double*)y.spinor, (double*)z.spinor, b, x.length);
axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, (double2*)x.spinor, (double2*)y.spinor, (double2*)z.spinor, b, x.length/2);
} else if (x.precision == QUDA_SINGLE_PRECISION) {
axpyZpbxKernel<<<dimGrid, dimBlock>>>((float)a, (float*)x.spinor, (float*)y.spinor, (float*)z.spinor, (float)b, x.length);
axpyZpbxKernel<<<dimGrid, dimBlock>>>((float)a, (float2*)x.spinor, (float2*)y.spinor, (float2*)z.spinor, (float)b, x.length/2);
} else {
int spinor_bytes = x.length*sizeof(short);
cudaBindTexture(0, texHalf1, x.spinor, spinor_bytes);
......@@ -1945,81 +1948,3 @@ double cpuDouble(float *data, int size) {
return sum;
}
/*
void blasTest() {
int n = 3*1<<24;
float *h_data = (float *)malloc(n*sizeof(float));
float *d_data;
if (cudaMalloc((void **)&d_data, n*sizeof(float))) {
printf("Error allocating d_data\n");
exit(0);
}
for (int i = 0; i < n; i++) {
h_data[i] = rand()/(float)RAND_MAX - 0.5; // n-1.0-i;
}
cudaMemcpy(d_data, h_data, n*sizeof(float), cudaMemcpyHostToDevice);
cudaThreadSynchronize();
stopwatchStart();
int LOOPS = 20;
for (int i = 0; i < LOOPS; i++) {
sumCuda(d_data, n);
}
cudaThreadSynchronize();
float secs = stopwatchReadSeconds();
printf("%f GiB/s\n", 1.e-9*n*sizeof(float)*LOOPS / secs);
printf("Device: %f MiB\n", (float)n*sizeof(float) / (1 << 20));
printf("Shared: %f KiB\n", (float)REDUCE_THREADS*sizeof(float) / (1 << 10));
float correctDouble = cpuDouble(h_data, n);
printf("Total: %f\n", correctDouble);
printf("CUDA db: %f\n", fabs(correctDouble-sumCuda(d_data, n)));
cudaFree(d_data) ;
free(h_data);
}
*/
/*
void axpbyTest() {
int n = 3 * 1 << 20;
float *h_x = (float *)malloc(n*sizeof(float));
float *h_y = (float *)malloc(n*sizeof(float));
float *h_res = (float *)malloc(n*sizeof(float));
float *d_x, *d_y;
if (cudaMalloc((void **)&d_x, n*sizeof(float))) {
printf("Error allocating d_x\n");
exit(0);
}
if (cudaMalloc((void **)&d_y, n*sizeof(float))) {
printf("Error allocating d_y\n");
exit(0);
}
for (int i = 0; i < n; i++) {
h_x[i] = 1;
h_y[i] = 2;
}
cudaMemcpy(d_x, h_x, n*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, n*sizeof(float), cudaMemcpyHostToDevice);
axpbyCuda(4, d_x, 3, d_y, n/2);
cudaMemcpy( h_res, d_y, n*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++) {
float expect = (i < n/2) ? 4*h_x[i] + 3*h_y[i] : h_y[i];
if (h_res[i] != expect)
printf("FAILED %d : %f != %f\n", i, h_res[i], h_y[i]);
}
cudaFree(d_y);
cudaFree(d_x);
free(h_x);
free(h_y);
}
*/
......@@ -173,27 +173,27 @@ int main(int argc, char** argv) {
int kernels[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
char names[][100] = {
"axpbyCuda(a, x, b, y): ",
"xpyCuda(x, y): ",
"axpyCuda(a, x, y): ",
"xpayCuda(x, a, y): ",
"mxpyCuda(x, y): ",
"axCuda(a, x): ",
"caxpyCuda(a2, x, y): ",
"caxpbyCuda(a2, x, b2, y): ",
"cxpaypbzCuda(x, a2, y, b2, z): ",
"axpyZpbxCuda(a, x, y, z, b): ",
"caxpbypzYmbwCuda(a2, x, b2, y, z, w): ",
"sumCuda(x): ",
"normCuda(x): ",
"reDotProductCuda(x, y): ",
"axpyNormCuda(a, x, y): ",
"xmyNormCuda(x, y): ",
"cDotProductCuda(x, y): ",
"xpaycDotzyCuda(x, a, y, z): ",
"cDotProductNormACuda(x, y): ",
"cDotProductNormBCuda(x, y): ",
"caxpbypzYmbwcDotProductWYNormYQuda(a2, x, b2, y, z, w, v): "
"axpbyCuda: ",
"xpyCuda: ",
"axpyCuda: ",
"xpayCuda: ",
"mxpyCuda: ",
"axCuda: ",
"caxpyCuda: ",
"caxpbyCuda: ",
"cxpaypbzCuda: ",
"axpyZpbxCuda: ",
"caxpbypzYmbwCuda: ",
"sumCuda: ",
"normCuda: ",
"reDotProductCuda: ",
"axpyNormCuda: ",
"xmyNormCuda: ",
"cDotProductCuda: ",
"xpaycDotzyCuda: ",
"cDotProductNormACuda: ",
"cDotProductNormBCuda: ",
"caxpbypzYmbwcDotProductWYNormYQuda: "
};
nIters = 1;
......@@ -202,6 +202,10 @@ int main(int argc, char** argv) {
benchmark(kernels[i]);
}
char filename[100];
sprintf(filename, "%d_%d_blas.dat", inv_param.cuda_prec, 256);
FILE *blas_out = fopen(filename, "w");
nIters = 300;
for (int i = 0; i <= 20; i++) {
blas_quda_flops = 0;
......@@ -209,10 +213,14 @@ int main(int argc, char** argv) {
double secs = benchmark(kernels[i]);
double flops = blas_quda_flops;
double bytes = blas_quda_bytes;
printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n\n",
printf("%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n",
names[i], secs, flops, (flops*1e-9)/(secs), bytes/(secs*(1<<30)));
fprintf(blas_out, "%s %f s, flops = %e, Gflops/s = %f, GiB/s = %f\n",
names[i], secs, flops, (flops*1e-9)/(secs), bytes/(secs*(1<<30)));
//printf("Bandwidth: %f GiB/s\n\n", GiB / secs);
}
fclose(blas_out);
}
......@@ -7,45 +7,44 @@
#include <util_quda.h>
ParityClover allocateParityClover(int *X, Precision precision)
void allocateParityClover(ParityClover *ret, int *X, Precision precision)
{
ParityClover ret;
ret.precision = precision;
ret.volume = 1;
ret->precision = precision;
ret->volume = 1;
for (int d=0; d<4; d++) {
ret.X[d] = X[d];
ret.volume *= X[d];
ret->X[d] = X[d];
ret->volume *= X[d];
}
ret.Nc = 3;
ret.Ns = 4;
ret.length = ret.volume*ret.Nc*ret.Nc*ret.Ns*ret.Ns/2; // block-diagonal Hermitian (72 reals)
if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
else ret.bytes = ret.length*sizeof(short);
ret->Nc = 3;
ret->Ns = 4;
ret->length = ret->volume*ret->Nc*ret->Nc*ret->Ns*ret->Ns/2; // block-diagonal Hermitian (72 reals)
if (cudaMalloc((void**)&(ret.clover), ret.bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover term\n");
exit(0);
}
if (precision == QUDA_DOUBLE_PRECISION) ret->bytes = ret->length*sizeof(double);
else if (precision == QUDA_SINGLE_PRECISION) ret->bytes = ret->length*sizeof(float);
else ret->bytes = ret->length*sizeof(short);
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret.cloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating cloverNorm\n");
if (!ret->clover) {
if (cudaMalloc((void**)&(ret->clover), ret->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover term\n");
exit(0);
}
}
if (!ret->cloverNorm) {
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret->cloverNorm, ret->bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating cloverNorm\n");
exit(0);
}
}
}
return ret;
}
FullClover allocateCloverField(int *X, Precision precision)
void allocateCloverField(FullClover *ret, int *X, Precision precision)
{
FullClover ret;
ret.even = allocateParityClover(X, precision);
ret.odd = allocateParityClover(X, precision);
return ret;
allocateParityClover(&(ret->even), X, precision);
allocateParityClover(&(ret->odd), X, precision);
}
void freeParityClover(ParityClover *clover)
......
......@@ -156,14 +156,14 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
inv_param->cloverGiB = 0;
if (h_clover) {
cudaCloverPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
allocateCloverField(&cudaCloverPrecise, X, inv_param->clover_cuda_prec);
loadCloverField(cudaCloverPrecise, h_clover, inv_param->clover_cpu_prec, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverPrecise.even.bytes / (1<<30);
if (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cudaCloverSloppy = allocateCloverField(X, inv_param->clover_cuda_prec_sloppy);
allocateCloverField(&cudaCloverSloppy, X, inv_param->clover_cuda_prec_sloppy);
loadCloverField(cudaCloverSloppy, h_clover, inv_param->clover_cpu_prec, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30);
} else {
......@@ -172,7 +172,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
} // sloppy precision clover term not needed otherwise
}
cudaCloverInvPrecise = allocateCloverField(X, inv_param->clover_cuda_prec);
allocateCloverField(&cudaCloverInvPrecise, X, inv_param->clover_cuda_prec);
if (!h_clovinv) {
printf("QUDA error: clover term inverse not implemented yet\n");
exit(-1);
......@@ -182,7 +182,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
inv_param->cloverGiB += 2.0*cudaCloverInvPrecise.even.bytes / (1<<30);
if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
cudaCloverInvSloppy = allocateCloverField(X, inv_param->clover_cuda_prec_sloppy);
allocateCloverField(&cudaCloverInvSloppy, X, inv_param->clover_cuda_prec_sloppy);
loadCloverField(cudaCloverInvSloppy, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order);
inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30);
} else {
......
......@@ -32,8 +32,8 @@ extern "C" {
// -- clover_quda.cpp
ParityClover allocateParityClover(int *X, Precision precision);
FullClover allocateCloverField(int *X, Precision precision);
void allocateParityClover(ParityClover *, int *X, Precision precision);
void allocateCloverField(FullClover *, int *X, Precision precision);
void freeParityClover(ParityClover *clover);
void freeCloverField(FullClover *clover);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment