
Commit 8cdc25ad authored by mikeaclark

Almost complete double precision inverter...

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@384 be54200a-260c-0410-bdd7-ce6af2a381ab
parent a2f374da
......@@ -23,39 +23,40 @@
extern "C" {
#endif
// ---------- blas_quda.cu ----------
// ---------- blas_quda.cu ----------
void zeroQuda(ParitySpinor a);
void copyQuda(ParitySpinor dst, ParitySpinor src);
double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y);
double sumQuda(ParitySpinor b);
double normQuda(ParitySpinor b);
double reDotProductQuda(ParitySpinor a, ParitySpinor b);
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyQuda(double a, ParitySpinor x, ParitySpinor y);
void axQuda(double a, ParitySpinor x);
void xpayQuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyQuda(ParitySpinor x, ParitySpinor y);
void axpyZpbxQuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, double b);
void zeroCuda(float* dst, int cnt);
void copyCuda(float* dst, float *src, int len);
void caxpbyQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y);
void caxpyQuda(double2 a, ParitySpinor x, ParitySpinor y);
void cxpaypbzQuda(ParitySpinor, double2 b, ParitySpinor y, double2 c, ParitySpinor z);
void caxpbypzYmbwQuda(double2, ParitySpinor, double2, ParitySpinor, ParitySpinor, ParitySpinor);
void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpyCuda(float a, float *x, float *y, int len);
void axCuda(float a, float *x, int len);
void xpayCuda(float *x, float a, float *y, int len);
void mxpyCuda(float *x, float *y, int len);
void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len);
double axpyNormCuda(float a, float *x, float *y, int len);
double sumCuda(float *a, int n);
double normCuda(float *a, int n);
double reDotProductCuda(float *a, float *b, int n);
void blasTest();
void axpbyTest();
void caxpbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len);
void caxpyCuda(float2 a, float2 *x, float2 *y, int len);
void cxpaypbzCuda(float2 *x, float2 b, float2 *y, float2 c, float2 *z, int len);
cuDoubleComplex cDotProductCuda(float2*, float2*, int len);
void caxpbypzYmbwCuda(float2, float2*, float2, float2*, float2*, float2*, int len);
double3 cDotProductNormACuda(float2 *a, float2 *b, int n);
double3 cDotProductNormBCuda(float2 *a, float2 *b, int n);
double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
float2 *z, float2 *w, float2 *u, int len);
cuDoubleComplex xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int len);
cuDoubleComplex cDotProductQuda(ParitySpinor, ParitySpinor);
cuDoubleComplex xpaycDotzyQuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z);
void blasTest();
void axpbyTest();
double3 cDotProductNormAQuda(ParitySpinor a, ParitySpinor b);
double3 cDotProductNormBQuda(ParitySpinor a, ParitySpinor b);
double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
ParitySpinor z, ParitySpinor w, ParitySpinor u);
#ifdef __cplusplus
}
#endif
......
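Editor's note: the header above pairs each low-level "Cuda" routine, which works on raw float arrays with an explicit length, with a precision-agnostic "Quda" wrapper that takes ParitySpinor arguments. Below is a minimal, self-contained sketch of how such a wrapper might dispatch on the field's precision; the enum, struct, and kernel bodies are simplified stand-ins (and the double-precision norm kernel is only assumed), not the library's actual implementation.

/* Sketch only: precision-dispatching wrapper in the spirit of the Quda
 * entry points declared above.  Types and kernels are local stand-ins. */
typedef enum { SKETCH_HALF, SKETCH_SINGLE, SKETCH_DOUBLE } SketchPrecision;
typedef struct { SketchPrecision precision; int length; void *spinor; } SketchParitySpinor;

static double normCudaSketch(const float *a, int n) {      /* single-precision field */
  double sum = 0.0;
  for (int i = 0; i < n; i++) sum += (double)a[i] * (double)a[i];
  return sum;
}

static double normDCudaSketch(const double *a, int n) {     /* hypothetical double kernel */
  double sum = 0.0;
  for (int i = 0; i < n; i++) sum += a[i] * a[i];
  return sum;
}

static double normQudaSketch(SketchParitySpinor a) {
  return (a.precision == SKETCH_DOUBLE)
      ? normDCudaSketch((const double *)a.spinor, a.length)
      : normCudaSketch((const float *)a.spinor, a.length);
}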
......@@ -52,7 +52,11 @@
#define DD_PARAM2 int oddBit
#else // xpay
#define DD_XPAY_F Xpay
#if (DD_SPREC == 0)
#define DD_PARAM2 int oddBit, double a
#else
#define DD_PARAM2 int oddBit, float a
#endif
#define DSLASH_XPAY
#endif
......
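Editor's note: the DD_ macros above choose the type of the xpay coefficient to match the spinor precision (DD_SPREC == 0 selects a double parameter), and the DD_XPAY_F fragment suggests that the many kernel variants (e.g. dslashDD12XpayKernel) are assembled by token pasting. The sketch below only illustrates that idea; the real dslash_def.h machinery is more elaborate, and the letter ordering (gauge precision, spinor precision, reconstruction, operation) is inferred from the call sites, not stated in this diff.

/* Illustration only -- not the library's real macros. */
#define DD_CONCAT(g, s, r, op) dslash ## g ## s ## r ## op ## Kernel
#define DD_FUNC(g, s, r, op)   DD_CONCAT(g, s, r, op)
/* DD_FUNC(D, D, 12, Xpay) expands to dslashDD12XpayKernel */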
......@@ -58,10 +58,16 @@ __constant__ int X3;
__constant__ int X4;
__constant__ int X1h;
__constant__ float anisotropy;
__constant__ float t_boundary;
__constant__ int gauge_fixed;
__constant__ float pi;
// single precision constants
__constant__ float anisotropy_f;
__constant__ float t_boundary_f;
__constant__ float pi_f;
// double precision constants
__constant__ double anisotropy;
__constant__ double t_boundary;
#include <dslash_def.h>
......@@ -150,13 +156,22 @@ __global__ void spinorHalfUnpack(ParitySpinor out) {
}
void setCudaGaugeParam() {
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(float));
float t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(float));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
float h_pi = M_PI;
cudaMemcpyToSymbol("pi", &(h_pi), sizeof(float));
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(double));
double t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double));
float anisotropy_f = gauge_param->anisotropy;
cudaMemcpyToSymbol("anisotropy_f", &(anisotropy_f), sizeof(float));
float t_bc_f = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary_f", &(t_bc_f), sizeof(float));
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
}
void bindGaugeTex(FullGauge gauge, int oddBit) {
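Editor's note: the hunk above keeps two copies of each gauge constant on the device — a double symbol read by double-precision kernels and a float symbol (suffix _f) read by the single- and half-precision paths. A minimal sketch of the upload pattern, using the same legacy string-named cudaMemcpyToSymbol form as the code above (error checking omitted; the function name setAnisotropySketch is invented for illustration):

#include <cuda_runtime.h>

/* Sketch: push one host parameter to both the double and the float device
   symbols, so kernels of either precision see a matching value. */
void setAnisotropySketch(double anisotropy) {
  float anisotropy_f = (float)anisotropy;
  cudaMemcpyToSymbol("anisotropy",   &anisotropy,   sizeof(double));
  cudaMemcpyToSymbol("anisotropy_f", &anisotropy_f, sizeof(float));
}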
......@@ -394,7 +409,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......@@ -402,52 +417,52 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(double);
cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexSingle, x.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexDouble, x.spinor, spinor_bytes);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashDS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashDS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashDS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashSS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashSS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashSS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
} else {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashHS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashHS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashHS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
}
......@@ -455,7 +470,7 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......@@ -516,7 +531,7 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......@@ -579,11 +594,10 @@ int dslashCudaSharedBytes() {
return SHARED_BYTES_SINGLE;
}
// Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -619,9 +633,9 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
}
// Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -657,13 +671,13 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kapp
}
void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
float kappa, ParitySpinor tmp, MatPCType matpc_type) {
double kappa, ParitySpinor tmp, MatPCType matpc_type) {
MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
}
// Apply the full operator
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
......@@ -679,7 +693,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
}
// Apply the full operator dagger
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
......
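Editor's note: for orientation, the kappa factors threaded through MatCuda and MatPCCuda above follow the standard Wilson-fermion relations (standard lattice background, not something stated in this diff):

\[
M \;=\; 1 - \kappa D, \qquad
D \;=\; \begin{pmatrix} 0 & D_{eo} \\ D_{oe} & 0 \end{pmatrix}, \qquad
\hat M_{ee} \;=\; 1 - \kappa^{2}\, D_{eo} D_{oe} .
\]

The full operator is therefore applied as a dslash followed by an xpay with coefficient -kappa, and the even-odd preconditioned operator as two dslash applications followed by an xpay with coefficient kappa2 = -kappa*kappa, which is the pattern of calls in the hunks above.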
......@@ -45,33 +45,33 @@ extern "C" {
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// Single precision routines
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// Half precision dslash routines
void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// wrapper to above
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
// Full Wilson matrix
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa);
void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa);
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
/*QudaSumComplex MatPCcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
......
......@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 0;
int test_type = 2;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -25,15 +25,15 @@ void *spinorEven, *spinorOdd;
double kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_precise = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_precise = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
......@@ -44,8 +44,8 @@ void init() {
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
......@@ -55,14 +55,16 @@ void init() {
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
// construct input fields
for (int dir = 0; dir < 4; dir++)
hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
spinor = malloc(N*spinorSiteSize*sSize);
spinorRef = malloc(N*spinorSiteSize*sSize);
spinorGPU = malloc(N*spinorSiteSize*sSize);
spinorEven = spinor;
spinorOdd = spinor + Nh*spinorSiteSize;
if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)
spinorOdd = (void*)((double*)spinor + Nh*spinorSiteSize);
else
spinorOdd = (void*)((float*)spinor + Nh*spinorSiteSize);
printf("Randomizing fields...");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
......@@ -79,14 +81,18 @@ void init() {
printf("Sending fields to GPU..."); fflush(stdout);
if (!TRANSFER) {
cudaSpinor = allocateSpinorField(inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(inv_param.cuda_prec);
tmp = allocateParitySpinor(inv_param.cuda_prec);
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
printf("\nnorm = %e\n", normCuda(cudaSpinor.even.spinor, Nh*24));
tmp = allocateParitySpinor(Nh, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(N, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(N, inv_param.cuda_prec);
if (test_type < 2) {
loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
} else {
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
}
}
}
......@@ -104,37 +110,10 @@ void end() {
endQuda();
}
void dslashRef() {
// compare to dslash reference implementation
printf("Calculating reference implementation...");
fflush(stdout);
switch (test_type) {
case 0:
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 2:
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
default:
printf("Test type not defined\n");
exit(-1);
}
printf("done.\n");
}
double dslashCUDA() {
// execute kernel
const int LOOPS = 20;
const int LOOPS = 10;
printf("Executing %d kernel loops...", LOOPS);
fflush(stdout);
stopwatchStart();
......@@ -164,31 +143,35 @@ double dslashCUDA() {
printf("done.\n\n");
return secs;
}
}
void strongCheck() {
int len;
void *spinorRes;
if (test_type < 2) {
len = Nh;
spinorRes = spinorOdd;
} else {
len = N;
spinorRes = spinorGPU;
void dslashRef() {
// compare to dslash reference implementation
printf("Calculating reference implementation...");
fflush(stdout);
switch (test_type) {
case 0:
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 2:
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
default:
printf("Test type not defined\n");
exit(-1);
}
printf("Reference:\n");
printSpinorElement(spinorRef, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRef, len-1, inv_param.cpu_prec); printf("\n");
printf("done.\n");
printf("\nCUDA:\n");
printSpinorElement(spinorRes, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRes, len-1, inv_param.cpu_prec); printf("\n");
//compare_spinor(spinorRef, spinorRes, len, inv_param.cpu_prec);
}
void dslashTest() {
init();
......@@ -207,9 +190,9 @@ void dslashTest() {
if (!TRANSFER) {
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
inv_param.cuda_prec, inv_param.dirac_order);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
inv_param.cuda_prec, inv_param.dirac_order);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
......@@ -223,7 +206,8 @@ void dslashTest() {
else res = compare_floats(spinorGPU, spinorRef, N*4*3*2, 1e-4, inv_param.cpu_prec);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
strongCheck();
if (test_type < 2) strong_check(spinorRef, spinorOdd, Nh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, Nh, inv_param.cpu_prec);
exit(0);
}
......
......@@ -6,6 +6,8 @@
#include <xmmintrin.h>
#define __DEVICE_EMULATION__
#define SHORT_LENGTH 65536
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
......@@ -75,8 +77,8 @@ inline void packH8S(short4 *res, float *g) {
inline void packD12D(double2 *res, double *g) {
for (int j=0; j<6; j++) {
res[j*Nh].x = g[j*4+0];
res[j*Nh].y = g[j*4+1];
res[j*Nh].x = g[j*2+0];
res[j*Nh].y = g[j*2+1];
}
}
......
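Editor's note: the corrected indexing above packs the first two rows of an SU(3) link (12 reals, i.e. 6 complex numbers) into six double2 words strided by the half-volume Nh. A self-contained sketch of that layout, with the double2 type and Nh defined locally for illustration (the value of Nh_sketch is arbitrary; res must hold at least 6*Nh_sketch entries):

typedef struct { double x, y; } double2_h;
enum { Nh_sketch = 8 };   /* half-lattice volume; illustrative value only */

static void packD12D_sketch(double2_h *res, const double *g) {
  for (int j = 0; j < 6; j++) {              /* 6 complex entries = 12 reals */
    res[j * Nh_sketch].x = g[j * 2 + 0];     /* real part */
    res[j * Nh_sketch].y = g[j * 2 + 1];     /* imaginary part */
  }
}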
......@@ -12,38 +12,35 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
int len = Nh*spinorSiteSize;
Precision prec = QUDA_SINGLE_PRECISION;
ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor r = allocateParitySpinor(prec);
ParitySpinor p = allocateParitySpinor(prec);
ParitySpinor v = allocateParitySpinor(prec);
ParitySpinor t = allocateParitySpinor(prec);
ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor y = allocateParitySpinor(prec);
ParitySpinor b = allocateParitySpinor(prec);
copyQuda(b, source);
copyQuda(r, b);
zeroQuda(y);
zeroQuda(x);
copyCuda((float *)b.spinor, (float *)source.spinor, len);
copyCuda((float *)r.spinor, (float *)b.spinor, len);
zeroCuda((float *)y.spinor, len);
zeroCuda((float *)x.spinor, len);
double b2 = normCuda((float *)b.spinor, len);
double b2 = normQuda(b);
double r2 = b2;
double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver
cuComplex rho = make_cuFloatComplex(1.0f, 0.0f);
cuComplex rho0 = rho;
cuComplex alpha = make_cuFloatComplex(1.0f, 0.0f);
cuComplex omega = make_cuFloatComplex(1.0f, 0.0f);
cuComplex beta;
cuDoubleComplex rho = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex rho0 = rho;
cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex omega = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex beta;
cuDoubleComplex rv;
cuComplex rho_rho0;
cuComplex alpha_omega;
cuComplex beta_omega;
cuComplex one = make_cuFloatComplex(1.0f, 0.0f);
cuDoubleComplex rho_rho0;
cuDoubleComplex alpha_omega;
cuDoubleComplex beta_omega;
cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);
double3 rho_r2;
double3 omega_t2;
......@@ -62,16 +59,16 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) {
rho = make_cuFloatComplex(r2, 0.0);
copyCuda((float *)p.spinor, (float *)r.spinor, len);
rho = make_cuDoubleComplex(r2, 0.0);
copyQuda(p, r);
} else {
alpha_omega = cuCdivf(alpha, omega);
rho_rho0 = cuCdivf(rho, rho0);
beta = cuCmulf(rho_rho0, alpha_omega);
alpha_omega = cuCdiv(alpha, omega);
rho_rho0 = cuCdiv(rho, rho0);
beta = cuCmul(rho_rho0, alpha_omega);
// p = r - beta*omega*v + beta*(p)
beta_omega = cuCmulf(beta, omega); beta_omega.x *= -1.0f; beta_omega.y *= -1.0f;
cxpaypbzCuda((float2*)r.spinor, beta_omega, (float2*)v.spinor, beta, (float2*)p.spinor, len/2); // 8
beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0;
cxpaypbzQuda(r, beta_omega, v, beta, p); // 8
}
if (dag_type == QUDA_DAG_NO)
......@@ -81,14 +78,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
//rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
rv = cDotProductCuda((float2*)source.spinor, (float2*)v.spinor, len/2);
cuComplex rv32 = make_cuFloatComplex((float)rv.x, (float)rv.y);
alpha = cuCdivf(rho, rv32);
rv = cDotProductQuda(source, v);
alpha = cuCdiv(rho, rv);
// r -= alpha*v
alpha.x *= -1.0f; alpha.y *= -1.0f;
caxpyCuda(alpha, (float2*)v.spinor, (float2*)r.spinor, len/2); // 4
alpha.x *= -1.0f; alpha.y *= -1.0f;
alpha.x *= -1.0; alpha.y *= -1.0;
caxpyQuda(alpha, v, r); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
......@@ -96,12 +92,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
MatPCDagCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda((float2*)t.spinor, (float2*)r.spinor, len/2); // 6
omega_t2 = cDotProductNormAQuda(t, r); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYCuda(alpha, (float2*)p.spinor, omega, (float2*)r.spinor,
(float2*)x.spinor, (float2*)t.spinor, (float2*)source.spinor, len/2);
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r, x, t, source);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates (ideally should be double precision)
......@@ -122,18 +117,18 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
invert_param -> cuda_prec = spinorPrec;
copyCuda((float*)r.spinor, (float*)b.spinor, len);
mxpyCuda((float*)t.spinor, (float*)r.spinor, len);
r2 = normCuda((float*)r.spinor, len);
copyQuda(r, b);
mxpyQuda(t, r);
r2 = normQuda(r);
rNorm = sqrt(r2);
maxrr = rNorm;
rUpdate++;
if (updateX) {
axpyCuda(1.0f, (float*)x.spinor, (float*)y.spinor, len);
zeroCuda((float*)x.spinor, len);
copyCuda((float*)b.spinor, (float*)r.spinor, len);
axpyQuda(1.0, x, y);
zeroQuda(x);
copyQuda(b, r);
r0Norm = rNorm;
maxrx = rNorm;
......@@ -144,9 +139,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
k++;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normCuda((float*)x.spinor, len));
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x));
}
axpyCuda(1.0f, (float*)y.spinor, (float*)x.spinor, len);
axpyQuda(1.0f, y, x);
invert_param->secs += stopwatchReadSeconds();
......@@ -167,9 +162,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
MatPCCuda(t.spinor, gauge, x.spinor, invert_param->kappa, tmp.spinor, invert_param->matpc_type);
else
MatPCDagCuda(t.spinor, gauge, x.spinor, invert_param->kappa, tmp.spinor, invert_param->matpc_type);
copyCuda((float *)r.spinor, (float *)b.spinor, len);
mxpyCuda((float *)t.spinor, (float *)r.spinor, len);
double true_res = normCuda((float *)r.spinor, len);
copyQuda(r, b);
mxpyQuda(t, r);
double true_res = normQuda(r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
......
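Editor's note: the fused Quda calls in the loop above implement the standard BiCGstab recurrence; the relations below simply restate the inline comments, with \hat M the even-odd preconditioned operator applied by MatPCCuda/MatPCDagCuda and r_0 the (unchanged) source vector:

\[
\beta = \frac{\rho}{\rho_0}\,\frac{\alpha}{\omega}, \qquad
p \leftarrow r - \beta\omega\, v + \beta p, \qquad
v = \hat M p, \qquad
\alpha = \frac{\rho}{(r_0, v)},
\]
\[
r \leftarrow r - \alpha v, \qquad
t = \hat M r, \qquad
\omega = \frac{(t, r)}{(t, t)}, \qquad
x \leftarrow x + \alpha p + \omega r, \qquad
r \leftarrow r - \omega t,
\]

with \rho \leftarrow (r_0, r) and r^2 = (r, r) returned by the same fused kernel (caxpbypzYmbwcDotProductWYNormYQuda). The reliable-update branch recomputes the residual from the accumulated solution and folds x into y whenever the iterated residual has drifted too far from its last trusted value.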
......@@ -10,31 +10,28 @@
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *perf)
{
int len = Nh*spinorSiteSize;
Precision prec = QUDA_SINGLE_PRECISION;
ParitySpinor r;
ParitySpinor p = allocateParitySpinor(prec);
ParitySpinor Ap = allocateParitySpinor(prec);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor Ap = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
double b2 = 0.0;
b2 = normCuda((float *)b.spinor, len);
b2 = normQuda(b);
float r2 = b2;
float r2_old;
float stop = r2*perf->tol*perf->tol; // stopping condition of solver
double r2 = b2;
double r2_old;
double stop = r2*perf->tol*perf->tol; // stopping condition of solver
float alpha, beta;
double alpha, beta;
double pAp;
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
r = allocateParitySpinor(prec);
copyCuda((float *)r.spinor, (float *)b.spinor, len);
r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
copyQuda(r, b);
} else {
r = b;
}
copyCuda((float *)p.spinor, (float *)r.spinor, len);
zeroCuda((float *)x.spinor, len);
copyQuda(p, r);
zeroQuda(x);
int k=0;
//printf("%d iterations, r2 = %e\n", k, r2);
......@@ -42,15 +39,15 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
pAp = reDotProductCuda((float *)p.spinor, (float *)Ap.spinor, len);
pAp = reDotProductQuda(p, Ap);
alpha = r2 / (float)pAp;
alpha = r2 / pAp;
r2_old = r2;
r2 = axpyNormCuda(-alpha, (float *)Ap.spinor, (float *)r.spinor, len);
r2 = axpyNormQuda(-alpha, Ap, r);
beta = r2 / r2_old;
axpyZpbxCuda(alpha, (float *)p.spinor, (float *)x.spinor, (float *)r.spinor, beta, len);
axpyZpbxQuda(alpha, p, x, r, beta);
k++;
printf("%d iterations, r2 = %e\n", k, r2);
......@@ -68,9 +65,9 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
#if 0
// Calculate the true residual
MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type);
copyCuda((float *)r.spinor, (float *)b.spinor, len);
mxpyCuda((float *)Ap.spinor, (float *)r.spinor, len);
double true_res = normCuda((float *)r.spinor, len);
copyQuda(r, b);
mxpyQuda(Ap, r);
double true_res = normQuda(r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
......
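Editor's note: the loop above is textbook conjugate gradients on the normal operator A = \hat M^\dagger \hat M (applied by MatPCDagMatPCCuda), with axpyZpbxQuda fusing the solution and search-direction updates into one pass:

\[
\alpha_k = \frac{(r_k, r_k)}{(p_k, A p_k)}, \qquad
r_{k+1} = r_k - \alpha_k A p_k, \qquad
\beta_k = \frac{(r_{k+1}, r_{k+1})}{(r_k, r_k)}, \qquad
x_{k+1} = x_k + \alpha_k p_k, \qquad
p_{k+1} = r_{k+1} + \beta_k p_k .
\]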
......@@ -9,6 +9,8 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
#include <blas_reference.h>
FullGauge cudaGaugePrecise; // precise gauge field
FullGauge cudaGaugeSloppy; // sloppy gauge field
......@@ -106,10 +108,10 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
gauge_param->packed_size = (gauge_param->reconstruct_precise == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct_precise, gauge_param->cuda_prec_precise);
gauge_param->gaugeGiB = (float)2*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB = (float)2*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
gauge_param->gaugeGiB = 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
......@@ -125,11 +127,10 @@ void endQuda()
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger)
{
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
printf("\nnorm = %e\n", normCuda((float*)(in.spinor), Nh*24));
dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
......@@ -139,9 +140,9 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
......@@ -154,9 +155,9 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCDagCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
......@@ -169,9 +170,9 @@ void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
......@@ -183,37 +184,27 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
}
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in, out;
in.even = allocateParitySpinor(inv_param->cuda_prec);
in.odd = allocateParitySpinor(inv_param->cuda_prec);
out.even = allocateParitySpinor(inv_param->cuda_prec);
out.odd = allocateParitySpinor(inv_param->cuda_prec);
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 0);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 0);
xpayCuda((float*)in.even.spinor, -inv_param->kappa, (float*)out.even.spinor, Nh*spinorSiteSize);
xpayCuda((float*)in.odd.spinor, -inv_param->kappa, (float*)out.odd.spinor, Nh*spinorSiteSize);
xpayQuda(in.even, -inv_param->kappa, out.even);
xpayQuda(in.odd, -inv_param->kappa, out.odd);
retrieveSpinorField(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(in.even);
freeParitySpinor(in.odd);
freeParitySpinor(out.even);
freeParitySpinor(out.odd);
freeSpinorField(out);
freeSpinorField(in);
}
void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in, out;
in.even = allocateParitySpinor(inv_param->cuda_prec);
in.odd = allocateParitySpinor(inv_param->cuda_prec);
out.even = allocateParitySpinor(inv_param->cuda_prec);
out.odd = allocateParitySpinor(inv_param->cuda_prec);
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
......@@ -221,56 +212,46 @@ void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 1);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 1);
xpayCuda((float*)in.even.spinor, -inv_param->kappa, (float*)out.even.spinor, Nh*spinorSiteSize);
xpayCuda((float*)in.odd.spinor, -inv_param->kappa, (float*)out.odd.spinor, Nh*spinorSiteSize);
xpayQuda(in.even, -inv_param->kappa, out.even);
xpayQuda(in.odd, -inv_param->kappa, out.odd);
retrieveSpinorField(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(in.even);
freeParitySpinor(in.odd);
freeParitySpinor(out.even);
freeParitySpinor(out.odd);
freeSpinorField(out);
freeSpinorField(in);
}
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
{
invert_param = param;
if (param->cuda_prec == QUDA_DOUBLE_PRECISION) {
printf("Sorry, only double precision not yet supported\n");
exit(-1);
}
if (param->cpu_prec == QUDA_HALF_PRECISION) {
printf("Half precision not supported on cpu\n");
exit(-1);
}
int slenh = Nh*spinorSiteSize;
float spinorGiB = (float)slenh*sizeof(float) / (1 << 30);
param->spinorGiB = (double)slenh*(param->cuda_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double): sizeof(float);
if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO)
spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7);
param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(1<<30);
else
spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9);
param->spinorGiB = spinorGiB;
param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(1<<30);
param->secs = 0;
param->gflops = 0;
param->iter = 0;
float kappa = param->kappa;
double kappa = param->kappa;
FullSpinor b, x;
ParitySpinor in = allocateParitySpinor(invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(invert_param->cuda_prec); // temporary used when applying operator
ParitySpinor in = allocateParitySpinor(Nh, invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(Nh, invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(Nh, invert_param->cuda_prec); // temporary used when applying operator
if (param->solution_type == QUDA_MAT_SOLUTION) {
if (param->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b.even = allocateParitySpinor(invert_param->cuda_prec);
b.odd = allocateParitySpinor(invert_param->cuda_prec);
b = allocateSpinorField(N, invert_param->cuda_prec);
} else {
b.even = out;
b.odd = tmp;
......@@ -283,8 +264,8 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION) {
axCuda(2*kappa, (float *)b.even.spinor, slenh);
axCuda(2*kappa, (float *)b.odd.spinor, slenh);
axQuda(2.0*kappa, b.even);
axQuda(2.0*kappa, b.odd);
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -300,15 +281,15 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION)
if (param->solution_type == QUDA_MATPC_SOLUTION)
axCuda(4*kappa*kappa, (float *)in.spinor, slenh);
axQuda(4.0*kappa*kappa, in);
else
axCuda(16*pow(kappa,4), (float *)in.spinor, slenh);
axQuda(16.0*pow(kappa,4), in);
}
switch (param->inv_type) {
case QUDA_CG_INVERTER:
if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
copyCuda((float *)out.spinor, (float *)in.spinor, slenh);
copyQuda(out, in);
MatPCDagCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type);
}
invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
......@@ -316,7 +297,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_YES);
copyCuda((float *)in.spinor, (float *)out.spinor, slenh);
copyQuda(in, out);
}
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_NO);
break;
......
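Editor's note on the constants in the mass-normalization branches above (an interpretation, not spelled out in the diff): with the operator in the \kappa normalization, M = 1 - \kappa D, switching the source to the 2\kappa (mass) normalization costs one factor of 2\kappa per power of \kappa carried by the operator being inverted, which matches the rescalings used here:

\[
2\kappa \ \text{(each parity of a MAT source)}, \qquad
(2\kappa)^2 = 4\kappa^2 \ \text{(MATPC solution)}, \qquad
(2\kappa)^4 = 16\kappa^4 \ \text{(}\hat M^\dagger \hat M \text{ solution)} .
\]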
......@@ -14,7 +14,7 @@ extern "C" {
int Z;
int T;
float anisotropy;
double anisotropy;
QudaGaugeFieldOrder gauge_order;
......@@ -31,21 +31,21 @@ extern "C" {
QudaTboundary t_boundary;
int packed_size;
float gaugeGiB;
double gaugeGiB;
} QudaGaugeParam;
typedef struct QudaInvertParam_s {
float kappa;
double kappa;
QudaMassNormalization mass_normalization;
QudaDslashType dslash_type;
QudaInverterType inv_type;
float tol;
double tol;
int iter;
int maxiter;
float reliable_delta; // reliable update tolerance
double reliable_delta; // reliable update tolerance
QudaMatPCType matpc_type;
QudaSolutionType solution_type;
......@@ -56,9 +56,9 @@ extern "C" {
QudaPrecision cuda_prec;
QudaDiracFieldOrder dirac_order;
float spinorGiB;
float gflops;
float secs;
double spinorGiB;
double gflops;
double secs;
} QudaInvertParam;
......
......@@ -36,36 +36,35 @@ int main(int argc, char **argv)
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
float mass = -0.958;
double mass = -0.958;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 5e-7;
inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-6;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
for (int dir = 0; dir < 4; dir++) {
gauge[dir] = malloc(N*gaugeSiteSize*sizeof(float));
}
construct_gauge_field(gauge, 1, QUDA_SINGLE_PRECISION);
float *spinorIn = (float*)malloc(N*spinorSiteSize*sizeof(float));
float *spinorOut = (float*)malloc(N*spinorSiteSize*sizeof(float));
float *spinorCheck = (float*)malloc(N*spinorSiteSize*sizeof(float));
if(!spinorCheck) {
printf("malloc failed in %s\n", __func__);
exit(1);
gauge[dir] = malloc(N*gaugeSiteSize*gSize);
}
construct_gauge_field(gauge, 1, Gauge_param.cpu_prec);
void *spinorIn = malloc(N*spinorSiteSize*sSize);
void *spinorOut = malloc(N*spinorSiteSize*sSize);
void *spinorCheck = malloc(N*spinorSiteSize*sSize);
int i0 = 0;
int s0 = 0;
int c0 = 0;
construct_spinor_field(spinorIn, 0, i0, s0, c0, QUDA_SINGLE_PRECISION);
construct_spinor_field(spinorIn, 0, i0, s0, c0, inv_param.cpu_prec);
double time0 = -((double)clock()); // Start the timer
......@@ -87,8 +86,8 @@ int main(int argc, char **argv)
ax(0.5/inv_param.kappa, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
mxpy(spinorIn, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
float nrm2 = norm_2(spinorCheck, N*spinorSiteSize, QUDA_SINGLE_PRECISION);
float src2 = norm_2(spinorIn, N*spinorSiteSize, QUDA_SINGLE_PRECISION);
double nrm2 = norm_2(spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
double src2 = norm_2(spinorIn, N*spinorSiteSize, inv_param.cpu_prec);
printf("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
endQuda();
......
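Editor's note: the host-side check above rescales the reconstructed operator application by 1/(2\kappa) (the ax(0.5/inv_param.kappa, ...) call — read here as undoing the \kappa normalization, which is an interpretation), subtracts the original source, and prints

\[
r_{\rm rel} = \sqrt{\frac{\lVert \mathrm{spinorCheck} \rVert^{2}}{\lVert \mathrm{spinorIn} \rVert^{2}}}
\]

for comparison against the requested inv_param.tol.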
......@@ -71,8 +71,8 @@ void init() {
int dev = 0;
cudaSetDevice(dev);
cudaFullSpinor = allocateSpinorField(single);
cudaParitySpinor = allocateParitySpinor(single);
cudaFullSpinor = allocateSpinorField(N, single);
cudaParitySpinor = allocateParitySpinor(Nh, single);
}
......
......@@ -3,10 +3,10 @@
#include <cuda_runtime.h>
#define L1 8 // "x" dimension
#define L2 8 // "y" dimension
#define L3 8 // "z" dimension
#define L4 8 // "time" dimension
#define L1 4 // "x" dimension
#define L2 4 // "y" dimension
#define L3 4 // "z" dimension
#define L4 4 // "time" dimension
#define L1h (L1/2) // half of the full "x" dimension, useful for even/odd lattice indexing
#define N (L1*L2*L3*L4) // total number of lattice points
......@@ -53,6 +53,7 @@ extern "C" {
typedef struct {
Precision precision;
int length; // geometric length of spinor
void *spinor; // either (double2*), (float4 *) or (short4 *), depending on precision
float *spinorNorm; // used only when precision is QUDA_HALF_PRECISION
} ParitySpinor;
......
......@@ -30,12 +30,12 @@
a##_im -= b##_re * c##_im + b##_im * c##_re
#define READ_GAUGE_MATRIX_12_DOUBLE(gauge, dir) \
double2 G0 = fetch_double2((gauge), ga_idx + ((dir/2)*3+0)*Nh); \
double2 G1 = fetch_double2((gauge), ga_idx + ((dir/2)*3+1)*Nh); \
double2 G2 = fetch_double2((gauge), ga_idx + ((dir/2)*3+2)*Nh); \
double2 G3 = fetch_double2((gauge), ga_idx + ((dir/2)*3+3)*Nh); \
double2 G4 = fetch_double2((gauge), ga_idx + ((dir/2)*3+4)*Nh); \
double2 G5 = fetch_double2((gauge), ga_idx + ((dir/2)*3+5)*Nh); \
double2 G0 = fetch_double2((gauge), ga_idx + ((dir/2)*6+0)*Nh); \
double2 G1 = fetch_double2((gauge), ga_idx + ((dir/2)*6+1)*Nh); \
double2 G2 = fetch_double2((gauge), ga_idx + ((dir/2)*6+2)*Nh); \
double2 G3 = fetch_double2((gauge), ga_idx + ((dir/2)*6+3)*Nh); \
double2 G4 = fetch_double2((gauge), ga_idx + ((dir/2)*6+4)*Nh); \
double2 G5 = fetch_double2((gauge), ga_idx + ((dir/2)*6+5)*Nh); \
double2 G6 = make_double2(0,0); \
double2 G7 = make_double2(0,0); \
double2 G8 = make_double2(0,0); \
......@@ -65,15 +65,15 @@
ACC_CONJ_PROD(g21, -g00, +g12); \
ACC_CONJ_PROD(g22, +g00, +g11); \
ACC_CONJ_PROD(g22, -g01, +g10); \
float u0 = (dir < 6 ? anisotropy : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary : 1)); \
float u0 = (dir < 6 ? anisotropy_f : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary_f : 1)); \
G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
// set A to be last components of G4 (otherwise unused)
#define READ_GAUGE_MATRIX_8_DOUBLE(gauge, dir) \
double2 G0 = fetch_double2((gauge), ga_idx + ((dir/2)*2+0)*Nh); \
double2 G1 = fetch_double2((gauge), ga_idx + ((dir/2)*2+1)*Nh); \
double2 G2 = fetch_double2((gauge), ga_idx + ((dir/2)*2+2)*Nh); \
double2 G3 = fetch_double2((gauge), ga_idx + ((dir/2)*2+3)*Nh); \
double2 G0 = fetch_double2((gauge), ga_idx + ((dir/2)*4+0)*Nh); \
double2 G1 = fetch_double2((gauge), ga_idx + ((dir/2)*4+1)*Nh); \
double2 G2 = fetch_double2((gauge), ga_idx + ((dir/2)*4+2)*Nh); \
double2 G3 = fetch_double2((gauge), ga_idx + ((dir/2)*4+3)*Nh); \
double2 G4 = make_double2(0,0); \
double2 G5 = make_double2(0,0); \
double2 G6 = make_double2(0,0); \
......@@ -99,8 +99,8 @@
float4 G2 = make_float4(0,0,0,0); \
float4 G3 = make_float4(0,0,0,0); \
float4 G4 = make_float4(0,0,0,0); \
g21_re = pi*g00_re; \
g21_im = pi*g00_im;
g21_re = pi_f*g00_re; \
g21_im = pi_f*g00_im;
#define RECONSTRUCT_MATRIX_8_DOUBLE(dir) \
double row_sum = g01_re*g01_re + g01_im*g01_im; \
......@@ -143,7 +143,7 @@
#define RECONSTRUCT_MATRIX_8_SINGLE(dir) \
float row_sum = g01_re*g01_re + g01_im*g01_im; \
row_sum += g02_re*g02_re + g02_im*g02_im; \
float u0 = (dir < 6 ? anisotropy : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary : 1)); \
float u0 = (dir < 6 ? anisotropy_f : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary_f : 1)); \
float u02_inv = __fdividef(1.f, u0*u0); \
float column_sum = u02_inv - row_sum; \
float U00_mag = sqrtf(column_sum); \
......
......@@ -6,128 +6,129 @@
#define ZCACC(c0, c1, a0, a1) zcadd((c0), (c1), (c0), (c1), (a0), (a1))
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
QudaSumFloat acc2 = 0;
QudaSumFloat acc3 = 0;
while (i < n) {
REDUCE_REAL_AUXILIARY(i);
REDUCE_IMAG_AUXILIARY(i);
DSACC(acc0, acc1, REDUCE_REAL_OPERATION(i), 0);
DSACC(acc2, acc3, REDUCE_IMAG_OPERATION(i), 0);
i += gridSize;
}
extern __shared__ QudaSumComplex cdata[];
QudaSumComplex *s = cdata + 2*tid;
s[0].x = acc0;
s[1].x = acc1;
s[0].y = acc2;
s[1].y = acc3;
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { ZCACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { ZCACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single QudaSumComplex
if (tid == 0) {
g_odata[blockIdx.x].x = cdata[0].x+cdata[1].x;
g_odata[blockIdx.x].y = cdata[0].y+cdata[1].y;
}
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
QudaSumFloat acc2 = 0;
QudaSumFloat acc3 = 0;
while (i < n) {
REDUCE_REAL_AUXILIARY(i);
REDUCE_IMAG_AUXILIARY(i);
DSACC(acc0, acc1, REDUCE_REAL_OPERATION(i), 0);
DSACC(acc2, acc3, REDUCE_IMAG_OPERATION(i), 0);
i += gridSize;
}
extern __shared__ QudaSumComplex cdata[];
QudaSumComplex *s = cdata + 2*tid;
s[0].x = acc0;
s[1].x = acc1;
s[0].y = acc2;
s[1].y = acc3;
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { ZCACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { ZCACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { ZCACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { ZCACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { ZCACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { ZCACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { ZCACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { ZCACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single QudaSumComplex
if (tid == 0) {
g_odata[blockIdx.x].x = cdata[0].x+cdata[1].x;
g_odata[blockIdx.x].y = cdata[0].y+cdata[1].y;
}
}
#else
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*REDUCE_THREADS + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
extern __shared__ QudaSumComplex cdata[];
QudaSumComplex *s = cdata + tid;
s[0].x = 0;
s[0].y = 0;
while (i < n) {
REDUCE_REAL_AUXILIARY(i);
REDUCE_IMAG_AUXILIARY(i);
s[0].x += REDUCE_REAL_OPERATION(i);
s[0].y += REDUCE_IMAG_OPERATION(i);
i += gridSize;
}
__syncthreads();
// do reduction in shared mem
if (REDUCE_THREADS >= 256) { if (tid < 128) { s[0].x += s[128].x; s[0].y += s[128].y; } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; }
if (REDUCE_THREADS >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; }
if (REDUCE_THREADS >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; }
if (REDUCE_THREADS >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; }
if (REDUCE_THREADS >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; }
if (REDUCE_THREADS >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; }
}
// write result for this block to global mem
if (tid == 0) {
g_odata[blockIdx.x].x = s[0].x;
g_odata[blockIdx.x].y = s[0].y;
}
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*REDUCE_THREADS + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
extern __shared__ QudaSumComplex cdata[];
QudaSumComplex *s = cdata + tid;
s[0].x = 0;
s[0].y = 0;
while (i < n) {
REDUCE_REAL_AUXILIARY(i);
REDUCE_IMAG_AUXILIARY(i);
s[0].x += REDUCE_REAL_OPERATION(i);
s[0].y += REDUCE_IMAG_OPERATION(i);
i += gridSize;
}
__syncthreads();
// do reduction in shared mem
if (REDUCE_THREADS >= 256) { if (tid < 128) { s[0].x += s[128].x; s[0].y += s[128].y; } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { s[0].x += s[ 64].x; s[0].y += s[ 64].y; } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { s[0].x += s[32].x; s[0].y += s[32].y; }
if (REDUCE_THREADS >= 32) { s[0].x += s[16].x; s[0].y += s[16].y; }
if (REDUCE_THREADS >= 16) { s[0].x += s[ 8].x; s[0].y += s[ 8].y; }
if (REDUCE_THREADS >= 8) { s[0].x += s[ 4].x; s[0].y += s[ 4].y; }
if (REDUCE_THREADS >= 4) { s[0].x += s[ 2].x; s[0].y += s[ 2].y; }
if (REDUCE_THREADS >= 2) { s[0].x += s[ 1].x; s[0].y += s[ 1].y; }
}
// write result for this block to global mem
if (tid == 0) {
g_odata[blockIdx.x].x = s[0].x;
g_odata[blockIdx.x].y = s[0].y;
}
}
#endif
template <typename Float, typename Float2>
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
exit(-1);
}
// allocate arrays on device and host to store one QudaSumComplex for each block
int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
QudaSumComplex h_odata[REDUCE_MAX_BLOCKS];
QudaSumComplex *d_odata;
if (cudaMalloc((void**) &d_odata, blocks*sizeof(QudaSumComplex))) {
printf("Error allocating reduction matrix\n");
exit(0);
}
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = REDUCE_THREADS * sizeof(QudaSumComplex);
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_odata, n);
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
cuDoubleComplex gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
for (int i = 0; i < blocks; i++) {
gpu_result.x += h_odata[i].x;
gpu_result.y += h_odata[i].y;
}
cudaFree(d_odata);
return gpu_result;
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
exit(-1);
}
// allocate arrays on device and host to store one QudaSumComplex for each block
int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
QudaSumComplex h_odata[REDUCE_MAX_BLOCKS];
QudaSumComplex *d_odata;
if (cudaMalloc((void**) &d_odata, blocks*sizeof(QudaSumComplex))) {
printf("Error allocating reduction matrix\n");
exit(0);
}
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = REDUCE_THREADS * sizeof(QudaSumComplex);
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_odata, n);
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
cuDoubleComplex gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
for (int i = 0; i < blocks; i++) {
gpu_result.x += h_odata[i].x;
gpu_result.y += h_odata[i].y;
}
cudaFree(d_odata);
return gpu_result;
}
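Editor's note: the REDUCE_KAHAN branch above accumulates each partial sum as a pair of floats ("double-single" arithmetic) via the DSACC/dsadd macros, which is why every accumulator and shared-memory slot comes in pairs (acc0/acc1, s[0]/s[1]). A host-side sketch of the underlying two-float addition is given below; it follows the widely used dsfun90-style algorithm and is only an illustration — the library's dsadd may differ in detail, and it relies on exact IEEE single-precision rounding with no re-association by the compiler.

/* Sketch: "double-single" addition of (hi, lo) float pairs, in the spirit of
 * the dsadd used by DSACC above. */
static void dsadd_sketch(float *c0, float *c1,
                         float a0, float a1, float b0, float b1) {
  float t1 = a0 + b0;                                   /* high-order sum       */
  float e  = t1 - a0;                                   /* rounding information */
  float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;    /* low-order correction */
  *c0 = t1 + t2;                                        /* renormalize the pair */
  *c1 = t2 - (*c0 - t1);
}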
......@@ -4,107 +4,108 @@
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
while (i < n) {
REDUCE_AUXILIARY(i);
DSACC(acc0, acc1, REDUCE_OPERATION(i), 0);
i += gridSize;
}
extern __shared__ QudaSumFloat sdata[];
QudaSumFloat *s = sdata + 2*tid;
s[0] = acc0;
s[1] = acc1;
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single float
if (tid == 0) g_odata[blockIdx.x] = sdata[0]+sdata[1];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(REDUCE_THREADS) + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
QudaSumFloat acc0 = 0;
QudaSumFloat acc1 = 0;
while (i < n) {
REDUCE_AUXILIARY(i);
DSACC(acc0, acc1, REDUCE_OPERATION(i), 0);
i += gridSize;
}
extern __shared__ QudaSumFloat sdata[];
QudaSumFloat *s = sdata + 2*tid;
s[0] = acc0;
s[1] = acc1;
__syncthreads();
if (REDUCE_THREADS >= 256) { if (tid < 128) { DSACC(s[0],s[1],s[256+0],s[256+1]); } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { DSACC(s[0],s[1],s[128+0],s[128+1]); } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { DSACC(s[0],s[1],s[64+0],s[64+1]); }
if (REDUCE_THREADS >= 32) { DSACC(s[0],s[1],s[32+0],s[32+1]); }
if (REDUCE_THREADS >= 16) { DSACC(s[0],s[1],s[16+0],s[16+1]); }
if (REDUCE_THREADS >= 8) { DSACC(s[0],s[1], s[8+0], s[8+1]); }
if (REDUCE_THREADS >= 4) { DSACC(s[0],s[1], s[4+0], s[4+1]); }
if (REDUCE_THREADS >= 2) { DSACC(s[0],s[1], s[2+0], s[2+1]); }
}
// write result for this block to global mem as single float
if (tid == 0) g_odata[blockIdx.x] = sdata[0]+sdata[1];
}
#else // true double precision kernel
__global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat *g_odata, unsigned int n) {
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*REDUCE_THREADS + threadIdx.x;
unsigned int gridSize = REDUCE_THREADS*gridDim.x;
extern __shared__ QudaSumFloat sdata[];
QudaSumFloat *s = sdata + tid;
s[0] = 0;
while (i < n) {
REDUCE_AUXILIARY(i);
s[0] += REDUCE_OPERATION(i);
i += gridSize;
}
__syncthreads();
// do reduction in shared mem
if (REDUCE_THREADS >= 256) { if (tid < 128) { s[0] += s[128]; } __syncthreads(); }
if (REDUCE_THREADS >= 128) { if (tid < 64) { s[0] += s[ 64]; } __syncthreads(); }
if (tid < 32) {
if (REDUCE_THREADS >= 64) { s[0] += s[32]; }
if (REDUCE_THREADS >= 32) { s[0] += s[16]; }
if (REDUCE_THREADS >= 16) { s[0] += s[ 8]; }
if (REDUCE_THREADS >= 8) { s[0] += s[ 4]; }
if (REDUCE_THREADS >= 4) { s[0] += s[ 2]; }
if (REDUCE_THREADS >= 2) { s[0] += s[ 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = s[0];
}
#endif
template <typename Float>
double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
return 0.;
}
// allocate arrays on device and host to store one QudaSumFloat for each block
int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
QudaSumFloat h_odata[REDUCE_MAX_BLOCKS];
QudaSumFloat *d_odata;
if (cudaMalloc((void**) &d_odata, blocks*sizeof(QudaSumFloat))) {
printf("Error allocating reduction matrix\n");
exit(0);
}
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = REDUCE_THREADS * sizeof(QudaSumFloat);
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_odata, n);
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
double cpu_sum = 0;
for (int i = 0; i < blocks; i++)
cpu_sum += h_odata[i];
cudaFree(d_odata);
return cpu_sum;
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
return 0.;
}
// allocate arrays on device and host to store one QudaSumFloat for each block
int blocks = min(REDUCE_MAX_BLOCKS, n / REDUCE_THREADS);
QudaSumFloat h_odata[REDUCE_MAX_BLOCKS];
QudaSumFloat *d_odata;
if (cudaMalloc((void**) &d_odata, blocks*sizeof(QudaSumFloat))) {
printf("Error allocating reduction matrix of size %d\n", blocks*sizeof(QudaSumFloat));
exit(0);
}
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = REDUCE_THREADS * sizeof(QudaSumFloat);
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_odata, n);
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
double cpu_sum = 0;
for (int i = 0; i < blocks; i++)
cpu_sum += h_odata[i];
cudaFree(d_odata);
return cpu_sum;
}
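// Hedged illustration of how this generic reduction is instantiated (the macro
// hooks are real, but the routine and file names below are assumptions): each
// BLAS routine defines the REDUCE_* hooks and re-includes the core file, e.g.
// for a norm-squared reduction over a float array:
//
//   #define REDUCE_FUNC_NAME(suffix) norm##suffix
//   #define REDUCE_TYPES float *a
//   #define REDUCE_PARAMS a
//   #define REDUCE_AUXILIARY(i)
//   #define REDUCE_OPERATION(i) (a[i]*a[i])
//   #include "reduce_core.h"
//   #undef REDUCE_FUNC_NAME
//   #undef REDUCE_TYPES
//   #undef REDUCE_PARAMS
//   #undef REDUCE_AUXILIARY
//   #undef REDUCE_OPERATION
//
// which expands to a normKernel()/normCuda() pair with the bodies shown above.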
#if (REDUCE_TYPE == REDUCE_KAHAN)
#define DSACC(c0, c1, a0, a1) dsadd((c0), (c1), (c0), (c1), (a0), (a1))
#define DSACC3(c0, c1, a0, a1) dsadd3((c0), (c1), (c0), (c1), (a0), (a1))
......@@ -106,6 +105,7 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumFloat3 *g_odata,
#endif
template <typename Float2>
double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
......
......@@ -6,6 +6,8 @@
#include <xmmintrin.h>
#define __DEVICE_EMULATION__
// GPU clover matrix
FullClover cudaClover;
......@@ -16,25 +18,26 @@ void *packedSpinor2 = 0;
// Half precision spinor field temporaries
ParitySpinor hSpinor1, hSpinor2;
ParitySpinor allocateParitySpinor(Precision precision) {
ParitySpinor allocateParitySpinor(int geometric_length, Precision precision) {
ParitySpinor ret;
ret.precision = precision;
ret.length = geometric_length*spinorSiteSize;
if (precision == QUDA_DOUBLE_PRECISION) {
int spinor_bytes = Nh*spinorSiteSize*sizeof(double);
int spinor_bytes = ret.length*sizeof(double);
if (cudaMalloc((void**)&ret.spinor, spinor_bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating spinor\n");
exit(0);
}
} else if (precision == QUDA_SINGLE_PRECISION) {
int spinor_bytes = Nh*spinorSiteSize*sizeof(float);
int spinor_bytes = ret.length*sizeof(float);
if (cudaMalloc((void**)&ret.spinor, spinor_bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating spinor\n");
exit(0);
}
} else if (precision == QUDA_HALF_PRECISION) {
int spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
int spinor_bytes = ret.length*sizeof(float)/2;
if (cudaMalloc((void**)&ret.spinor, spinor_bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating spinor\n");
exit(0);
......@@ -47,22 +50,22 @@ ParitySpinor allocateParitySpinor(Precision precision) {
printf("Error allocating spinor: double precision not supported\n");
exit(0);
}
return ret;
}
FullSpinor allocateSpinorField(Precision precision) {
FullSpinor allocateSpinorField(int length, Precision precision) {
FullSpinor ret;
ret.even = allocateParitySpinor(precision);
ret.odd = allocateParitySpinor(precision);
ret.even = allocateParitySpinor(length/2, precision);
ret.odd = allocateParitySpinor(length/2, precision);
return ret;
}
void allocateSpinorHalf() {
Precision precision = QUDA_HALF_PRECISION;
hSpinor1 = allocateParitySpinor(precision);
hSpinor2 = allocateParitySpinor(precision);
hSpinor1 = allocateParitySpinor(Nh, precision);
hSpinor2 = allocateParitySpinor(Nh, precision);
}
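// Hedged usage sketch of the new length-aware allocators (N and Nh are the
// full- and half-lattice site counts assumed elsewhere in this file; actual
// call sites in the inverter may differ):
//   FullSpinor   tmp = allocateSpinorField(N, QUDA_DOUBLE_PRECISION);
//   ParitySpinor r   = allocateParitySpinor(Nh, QUDA_SINGLE_PRECISION);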
ParityClover allocateParityClover() {
......@@ -131,7 +134,7 @@ inline void unpackDouble2(double *a, double2 *b) {
inline void packFloat4(float4* a, float *b) {
__m128 SSEtmp;
SSEtmp = _mm_loadu_ps((const float*)b);
_mm_store_ps((float*)a, SSEtmp);
_mm_storeu_ps((float*)a, SSEtmp);
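  // _mm_storeu_ps tolerates any alignment; the aligned _mm_store_ps would fault
  // whenever the destination float4 does not sit on a 16-byte boundary (likely
  // the case for some staging buffers, e.g. plain malloc in emulation mode)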
//a->x = b[0]; a->y = b[1]; a->z = b[2]; a->w = b[3];
}
......@@ -163,7 +166,7 @@ void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
}
}
for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*4);
for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*2);
}
{ // odd sites
......@@ -179,7 +182,7 @@ void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
}
}
for (int j = 0; j < 12; j++) packDouble2(odd+j*Nh+i, b+j*4);
for (int j=0; j<12; j++) packDouble2(odd+j*Nh+i, b+j*2);
}
}
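// Hedged sketch of the helpers assumed by the packing loops above (the real
// definitions appear earlier in this file): a double2 holds two consecutive
// doubles, so the j-th element starts at offset 2*j -- hence the b+j*4 to
// b+j*2 stride fix in this commit.
inline void packDouble2(double2 *a, double *b) {
  a->x = b[0]; a->y = b[1];
}
inline void unpackDouble2(double *a, double2 *b) {
  a[0] = b->x; a[1] = b->y;
}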
......@@ -206,7 +209,7 @@ void packFullSpinorSD(float4 *even, float4 *odd, double *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(even+j*Nh+i, b+j*4);
for (int j=0; j<6; j++) packFloat4(even+j*Nh+i, b+j*4);
}
{ // odd sites
......@@ -222,7 +225,7 @@ void packFullSpinorSD(float4 *even, float4 *odd, double *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(odd+j*Nh+i, b+j*4);
for (int j=0; j<6; j++) packFloat4(odd+j*Nh+i, b+j*4);
}
}
......@@ -238,7 +241,6 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
{ // even sites
int k = 2*i + boundaryCrossings%2;
//printf("%d %d\n", i, k);
float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -250,12 +252,11 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(even+j*Nh+i, b+j*4);
for (int j=0; j<6; j++) packFloat4(even+j*Nh+i, b+j*4);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
//printf("%d %d\n", i, k);
float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -267,7 +268,7 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(odd+j*Nh+i, b+j*4);
for (int j=0; j<6; j++) packFloat4(odd+j*Nh+i, b+j*4);
}
}
......@@ -291,7 +292,7 @@ void packParitySpinorDD(double2 *res, double *spinor) {
}
}
for (int j = 0; j < 12; j++) packDouble2(res+j*Nh+i, b+j*4);
for (int j=0; j<12; j++) packDouble2(res+j*Nh+i, b+j*2);
}
}
......@@ -312,7 +313,7 @@ void packParitySpinorSD(float4 *res, double *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
for (int j=0; j<6; j++) packFloat4(res+j*Nh+i, b+j*4);
}
}
......@@ -357,7 +358,7 @@ void packQDPParitySpinorDD(double2 *res, double *spinor) {
}
}
for (int j = 0; j < 6; j++) packDouble2(res+j*Nh+i, b+j*4);
for (int j = 0; j < 6; j++) packDouble2(res+j*Nh+i, b+j*2);
}
}
......@@ -419,7 +420,7 @@ void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
int k = 2*i + boundaryCrossings%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, even+j*Nh+i);
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, even+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -437,7 +438,7 @@ void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
int k = 2*i + (boundaryCrossings+1)%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, odd+j*Nh+i);
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, odd+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -555,7 +556,7 @@ void unpackParitySpinorDD(double *res, double2 *spinorPacked) {
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, spinorPacked+j*Nh+i);
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -624,7 +625,7 @@ void unpackQDPParitySpinorDD(double *res, double2 *spinorPacked) {
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, spinorPacked+j*Nh+i);
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
......@@ -687,6 +688,11 @@ void unpackQDPParitySpinorSS(float *res, float4 *spinorPacked) {
void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
Precision cuda_prec, DiracFieldOrder dirac_order) {
if (cuda_prec == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error, cannot have CUDA double precision without double CPU precision\n");
exit(-1);
}
if (cuda_prec == QUDA_HALF_PRECISION) {
if (!hSpinor1.spinor && !hSpinor1.spinorNorm &&
!hSpinor2.spinor && !hSpinor2.spinorNorm ) {
......@@ -705,10 +711,12 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
#ifndef __DEVICE_EMULATION__
//if (!packedSpinor1) cudaHostAlloc(&packedSpinor1, spinor_bytes, cudaHostAllocDefault);
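  // cudaMallocHost returns pinned (page-locked) host memory, so the packed
  // spinor can be copied to the device faster than from pageable memory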
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
#else
if (!packedSpinor1) packedSpinor1 = malloc(spinor_bytes);
#endif
if (dirac_order == QUDA_DIRAC_ORDER || dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
packParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
......
......@@ -8,8 +8,8 @@
extern "C" {
#endif
ParitySpinor allocateParitySpinor(Precision precision);
FullSpinor allocateSpinorField(Precision precision);
ParitySpinor allocateParitySpinor(int length, Precision precision);
FullSpinor allocateSpinorField(int length, Precision precision);
ParityClover allocateParityClover();
FullClover allocateCloverField();
......