Advanced Computing Platform for Theoretical Physics

Committing large files can make the server unstable; please commit only code and avoid committing large files.

Commit 8cdc25ad authored by mikeaclark

Almost complete double precision inverter...

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@384 be54200a-260c-0410-bdd7-ce6af2a381ab
parent a2f374da
@@ -23,39 +23,40 @@
extern "C" {
#endif
// ---------- blas_quda.cu ----------
// ---------- blas_quda.cu ----------
void zeroQuda(ParitySpinor a);
void copyQuda(ParitySpinor dst, ParitySpinor src);
double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y);
double sumQuda(ParitySpinor b);
double normQuda(ParitySpinor b);
double reDotProductQuda(ParitySpinor a, ParitySpinor b);
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyQuda(double a, ParitySpinor x, ParitySpinor y);
void axQuda(double a, ParitySpinor x);
void xpayQuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyQuda(ParitySpinor x, ParitySpinor y);
void axpyZpbxQuda(double a, ParitySpinor x, ParitySpinor y, ParitySpinor z, double b);
void zeroCuda(float* dst, int cnt);
void copyCuda(float* dst, float *src, int len);
void caxpbyQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y);
void caxpyQuda(double2 a, ParitySpinor x, ParitySpinor y);
void cxpaypbzQuda(ParitySpinor, double2 b, ParitySpinor y, double2 c, ParitySpinor z);
void caxpbypzYmbwQuda(double2, ParitySpinor, double2, ParitySpinor, ParitySpinor, ParitySpinor);
void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpyCuda(float a, float *x, float *y, int len);
void axCuda(float a, float *x, int len);
void xpayCuda(float *x, float a, float *y, int len);
void mxpyCuda(float *x, float *y, int len);
void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len);
double axpyNormCuda(float a, float *x, float *y, int len);
double sumCuda(float *a, int n);
double normCuda(float *a, int n);
double reDotProductCuda(float *a, float *b, int n);
void blasTest();
void axpbyTest();
void caxpbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len);
void caxpyCuda(float2 a, float2 *x, float2 *y, int len);
void cxpaypbzCuda(float2 *x, float2 b, float2 *y, float2 c, float2 *z, int len);
cuDoubleComplex cDotProductCuda(float2*, float2*, int len);
void caxpbypzYmbwCuda(float2, float2*, float2, float2*, float2*, float2*, int len);
double3 cDotProductNormACuda(float2 *a, float2 *b, int n);
double3 cDotProductNormBCuda(float2 *a, float2 *b, int n);
double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
float2 *z, float2 *w, float2 *u, int len);
cuDoubleComplex xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int len);
cuDoubleComplex cDotProductQuda(ParitySpinor, ParitySpinor);
cuDoubleComplex xpaycDotzyQuda(ParitySpinor x, double a, ParitySpinor y, ParitySpinor z);
void blasTest();
void axpbyTest();
double3 cDotProductNormAQuda(ParitySpinor a, ParitySpinor b);
double3 cDotProductNormBQuda(ParitySpinor a, ParitySpinor b);
double3 caxpbypzYmbwcDotProductWYNormYQuda(double2 a, ParitySpinor x, double2 b, ParitySpinor y,
ParitySpinor z, ParitySpinor w, ParitySpinor u);
#ifdef __cplusplus
}
#endif
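The Quda-level wrappers declared above operate on ParitySpinor objects with double (or double2/cuDoubleComplex) scalars, hiding the precision-specific *Cuda routines underneath. As a rough usage sketch only (assuming the usual axpy/xpay semantics, y += a*x and y = x + a*y, and that axpyNormQuda returns the squared norm of the updated y; these are inferences, not statements from this header), a CG-style residual update written against this interface would look like:

/* Hypothetical sketch, not part of this commit. r, p, Ap, x are ParitySpinor
   fields of matching precision; r2 holds the current |r|^2.                  */
double alpha = r2 / reDotProductQuda(p, Ap);  /* alpha = (r,r) / (p, A p)     */
double r2_new = axpyNormQuda(-alpha, Ap, r);  /* r -= alpha*A p, new |r|^2    */
axpyQuda(alpha, p, x);                        /* x += alpha*p                 */
xpayQuda(r, r2_new / r2, p);                  /* p  = r + beta*p              */
r2 = r2_new;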
@@ -52,7 +52,11 @@
#define DD_PARAM2 int oddBit
#else // xpay
#define DD_XPAY_F Xpay
#if (DD_SPREC == 0)
#define DD_PARAM2 int oddBit, double a
#else
#define DD_PARAM2 int oddBit, float a
#endif
#define DSLASH_XPAY
#endif
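With this change the coefficient handed to the generated Xpay kernels matches the spinor storage precision: DD_SPREC == 0 (double) kernels receive a double, all others a float. Inferred from the macros above (an illustration only, not code from this commit), the generated prototypes should now expand roughly as:

/* Illustration of the expanded kernel signatures; the output-parameter name is hypothetical. */
__global__ void dslashDD12XpayKernel(double2 *out, int oddBit, double a);  /* DD_SPREC == 0 */
__global__ void dslashSS12XpayKernel(float4 *out, int oddBit, float a);    /* single precision */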
@@ -58,10 +58,16 @@ __constant__ int X3;
__constant__ int X4;
__constant__ int X1h;
__constant__ float anisotropy;
__constant__ float t_boundary;
__constant__ int gauge_fixed;
__constant__ float pi;
// single precision constants
__constant__ float anisotropy_f;
__constant__ float t_boundary_f;
__constant__ float pi_f;
// double precision constants
__constant__ double anisotropy;
__constant__ double t_boundary;
#include <dslash_def.h>
@@ -150,13 +156,22 @@ __global__ void spinorHalfUnpack(ParitySpinor out) {
}
void setCudaGaugeParam() {
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(float));
float t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(float));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
float h_pi = M_PI;
cudaMemcpyToSymbol("pi", &(h_pi), sizeof(float));
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(double));
double t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double));
float anisotropy_f = gauge_param->anisotropy;
cudaMemcpyToSymbol("anisotropy_f", &(anisotropy_f), sizeof(float));
float t_bc_f = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary_f", &(t_bc_f), sizeof(float));
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
}
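setCudaGaugeParam now uploads each gauge constant twice: a double master copy (anisotropy, t_boundary) and a float mirror with an _f suffix for the single- and half-precision kernels. A minimal sketch of how a kernel might select between them at compile time (an assumed pattern, not shown in this commit; the ANISO macro is hypothetical):

#if (DD_SPREC == 0)          /* double-precision spinor kernel */
#define ANISO anisotropy     /* read the double __constant__   */
#else
#define ANISO anisotropy_f   /* read the float mirror          */
#endif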
void bindGaugeTex(FullGauge gauge, int oddBit) {
@@ -394,7 +409,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
@@ -402,52 +417,52 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
bindGaugeTex(gauge, oddBit);
int spinor_bytes = Nh*spinorSiteSize*sizeof(double);
cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexSingle, x.spinor, spinor_bytes);
cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes);
cudaBindTexture(0, accumTexDouble, x.spinor, spinor_bytes);
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashDS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashDS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashDS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashDS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashDD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
} else if (gauge.precision == QUDA_SINGLE_PRECISION) {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashSS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashSS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashSS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashSD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
} else {
if (gauge.reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHS12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD12XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
} else {
dslashHS12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
} else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashHS8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD8XpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
else {
dslashHS8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>> ((float4 *)res.spinor, oddBit, a);
dslashHD8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES_DOUBLE>>> ((double2 *)res.spinor, oddBit, a);
}
}
}
@@ -455,7 +470,7 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
@@ -516,7 +531,7 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
@@ -579,11 +594,10 @@ int dslashCudaSharedBytes() {
return SHARED_BYTES_SINGLE;
}
// Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
@@ -619,9 +633,9 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
}
// Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
@@ -657,13 +671,13 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kapp
}
void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
float kappa, ParitySpinor tmp, MatPCType matpc_type) {
double kappa, ParitySpinor tmp, MatPCType matpc_type) {
MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
}
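For reference, with kappa2 = -kappa*kappa the routines above implement the even-odd preconditioned Wilson operator in the usual convention (stated here as a reminder rather than read off the diff):

  \hat{M}_{ee} = 1 - \kappa^2 D_{eo} D_{oe}

MatPCDagCuda applies \hat{M}^\dagger and MatPCDagMatPCCuda the normal operator \hat{M}^\dagger \hat{M} used by CG, while MatCuda and MatDaggerCuda below apply the full Wilson matrix M = 1 - \kappa D, which is why they call the Xpay dslash with a = -kappa.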
// Apply the full operator
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
@@ -679,7 +693,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
}
// Apply the full operator dagger
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa) {
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
@@ -45,33 +45,33 @@ extern "C" {
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// Single precision routines
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// Half precision dslash routines
void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int oddBit, int daggerBit, ParitySpinor x, double a);
// wrapper to above
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
// Full Wilson matrix
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa);
void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, float kappa);
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type);
/*QudaSumComplex MatPCcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 0;
int test_type = 2;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
@@ -25,15 +25,15 @@ void *spinorEven, *spinorOdd;
double kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_precise = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_precise = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
@@ -44,8 +44,8 @@ void init() {
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
@@ -55,14 +55,16 @@ void init() {
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
// construct input fields
for (int dir = 0; dir < 4; dir++)
hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
spinor = malloc(N*spinorSiteSize*sSize);
spinorRef = malloc(N*spinorSiteSize*sSize);
spinorGPU = malloc(N*spinorSiteSize*sSize);
spinorEven = spinor;
spinorOdd = spinor + Nh*spinorSiteSize;
if (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)
spinorOdd = (void*)((double*)spinor + Nh*spinorSiteSize);
else
spinorOdd = (void*)((float*)spinor + Nh*spinorSiteSize);
printf("Randomizing fields...");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
@@ -79,14 +81,18 @@ void init() {
printf("Sending fields to GPU..."); fflush(stdout);
if (!TRANSFER) {
cudaSpinor = allocateSpinorField(inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(inv_param.cuda_prec);
tmp = allocateParitySpinor(inv_param.cuda_prec);
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
printf("\nnorm = %e\n", normCuda(cudaSpinor.even.spinor, Nh*24));
tmp = allocateParitySpinor(Nh, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(N, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(N, inv_param.cuda_prec);
if (test_type < 2) {
loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
} else {
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
}
}
}
@@ -104,37 +110,10 @@ void end() {
endQuda();
}
void dslashRef() {
// compare to dslash reference implementation
printf("Calculating reference implementation...");
fflush(stdout);
switch (test_type) {
case 0:
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 2:
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
default:
printf("Test type not defined\n");
exit(-1);
}
printf("done.\n");
}
double dslashCUDA() {
// execute kernel
const int LOOPS = 20;
const int LOOPS = 10;
printf("Executing %d kernel loops...", LOOPS);
fflush(stdout);
stopwatchStart();
@@ -164,31 +143,35 @@ double dslashCUDA() {
printf("done.\n\n");
return secs;
}
}
void strongCheck() {
int len;
void *spinorRes;
if (test_type < 2) {
len = Nh;
spinorRes = spinorOdd;
} else {
len = N;
spinorRes = spinorGPU;
void dslashRef() {
// compare to dslash reference implementation
printf("Calculating reference implementation...");
fflush(stdout);
switch (test_type) {
case 0:
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 2:
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
default:
printf("Test type not defined\n");
exit(-1);
}
printf("Reference:\n");
printSpinorElement(spinorRef, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRef, len-1, inv_param.cpu_prec); printf("\n");
printf("done.\n");
printf("\nCUDA:\n");
printSpinorElement(spinorRes, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRes, len-1, inv_param.cpu_prec); printf("\n");
//compare_spinor(spinorRef, spinorRes, len, inv_param.cpu_prec);
}
void dslashTest() {
init();
@@ -207,9 +190,9 @@ void dslashTest() {
if (!TRANSFER) {
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
inv_param.cuda_prec, inv_param.dirac_order);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
inv_param.cuda_prec, inv_param.dirac_order);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
@@ -223,7 +206,8 @@ void dslashTest() {
else res = compare_floats(spinorGPU, spinorRef, N*4*3*2, 1e-4, inv_param.cpu_prec);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
strongCheck();
if (test_type < 2) strong_check(spinorRef, spinorOdd, Nh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, Nh, inv_param.cpu_prec);
exit(0);
}
@@ -6,6 +6,8 @@
#include <xmmintrin.h>
#define __DEVICE_EMULATION__
#define SHORT_LENGTH 65536
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
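These constants define the float-to-short mapping used by the half-precision packing. Assuming the conversion is applied as s = SCALE_FLOAT*(f + SHIFT_FLOAT) (an inference from the definitions, not something shown in this hunk), a value f in [-1, 1] fills the full signed 16-bit range:

/* Hypothetical helper illustrating the mapping implied by the constants above. */
inline short floatToShort(float f) {
  /* SCALE_FLOAT = 65535/2 = 32767.5 and SCALE_FLOAT*SHIFT_FLOAT = -0.5, so     */
  /* f = +1 -> 32767.5 - 0.5 = 32767 and f = -1 -> -32767.5 - 0.5 = -32768.     */
  return (short)(SCALE_FLOAT*(f + SHIFT_FLOAT));
}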
@@ -75,8 +77,8 @@ inline void packH8S(short4 *res, float *g) {
inline void packD12D(double2 *res, double *g) {
for (int j=0; j<6; j++) {
res[j*Nh].x = g[j*4+0];
res[j*Nh].y = g[j*4+1];
res[j*Nh].x = g[j*2+0];
res[j*Nh].y = g[j*2+1];
}
}
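The packD12D index fix reflects the double storage layout: with 12-real reconstruction only the first two rows of the SU(3) link are kept, i.e. six complex numbers, and in a flat double array the real and imaginary parts of complex entry j sit at g[2*j] and g[2*j+1]; the old j*4 indexing, presumably carried over from the float4 packers, skipped every other pair. The Nh stride on the output interleaves the six double2 words across sites so neighbouring threads read contiguous memory. A matching unpack routine, written here only to illustrate the layout (an assumption, not part of this diff), would simply invert the indexing:

/* Hypothetical inverse of packD12D above, for illustration. */
inline void unpackD12D(double *g, double2 *res) {
  for (int j = 0; j < 6; j++) {
    g[j*2+0] = res[j*Nh].x;  /* Re of complex entry j */
    g[j*2+1] = res[j*Nh].y;  /* Im of complex entry j */
  }
}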
@@ -12,38 +12,35 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
int len = Nh*spinorSiteSize;
Precision prec = QUDA_SINGLE_PRECISION;
ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor r = allocateParitySpinor(prec);
ParitySpinor p = allocateParitySpinor(prec);
ParitySpinor v = allocateParitySpinor(prec);
ParitySpinor t = allocateParitySpinor(prec);
ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor y = allocateParitySpinor(prec);
ParitySpinor b = allocateParitySpinor(prec);
copyQuda(b, source);
copyQuda(r, b);
zeroQuda(y);
zeroQuda(x);
copyCuda((float *)b.spinor, (float *)source.spinor, len);
copyCuda((float *)r.spinor, (float *)b.spinor, len);
zeroCuda((float *)y.spinor, len);
zeroCuda((float *)x.spinor, len);
double b2 = normCuda((float *)b.spinor, len);
double b2 = normQuda(b);
double r2 = b2;
double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver
cuComplex rho = make_cuFloatComplex(1.0f, 0.0f);
cuComplex rho0 = rho;
cuComplex alpha = make_cuFloatComplex(1.0f, 0.0f);
cuComplex omega = make_cuFloatComplex(1.0f, 0.0f);
cuComplex beta;
cuDoubleComplex rho = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex rho0 = rho;
cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex omega = make_cuDoubleComplex(1.0, 0.0);
cuDoubleComplex beta;
cuDoubleComplex rv;
cuComplex rho_rho0;
cuComplex alpha_omega;
cuComplex beta_omega;
cuComplex one = make_cuFloatComplex(1.0f, 0.0f);
cuDoubleComplex rho_rho0;
cuDoubleComplex alpha_omega;
cuDoubleComplex beta_omega;
cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);
double3 rho_r2;
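With this change the BiCGstab scalar recurrences (rho, alpha, omega, beta and the intermediate products rho_rho0, alpha_omega, beta_omega) are accumulated in cuDoubleComplex regardless of the spinor precision. For orientation, these quantities enter the textbook BiCGstab update (quoted from the standard algorithm, not from lines shown in this hunk):

  \beta = (\rho / \rho_0)(\alpha / \omega),    p \leftarrow r + \beta (p - \omega v),    \alpha = \rho / (\hat{r}_0, v)

where rv presumably holds the inner product (\hat{r}_0, v).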