Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 85b72c53 authored by mikeaclark's avatar mikeaclark
Browse files

Finally, double/single precision inverter implemented

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@398 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 6d144a26
CUFILES := dslash_cuda.cu blas_cuda.cu
CCFILES := inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp gauge_cuda.cpp spinor_quda.cpp
CUDA_INSTALL_PATH = /usr/local/cuda
CUDA_INSTALL_PATH = /usr/local/cuda-2.0.0.2.1221
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_INSTALL_PATH)/../cuda_sdk/common/inc
LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart
DFLAGS = #-D__DEVICE_EMULATION__
CC = gcc
CFLAGS = -Wall -g -std=c99 $(INCLUDES) ${DFLAGS}
CFLAGS = -Wall -O3 -std=c99 $(INCLUDES) ${DFLAGS}
CXX = g++
CXXFLAGS = -Wall -g $(INCLUDES) ${DFLAGS}
CXXFLAGS = -Wall -O3 $(INCLUDES) ${DFLAGS}
NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
NVCCFLAGS = -g $(INCLUDES) ${DFLAGS} -arch=sm_13 #-deviceemu
NVCCFLAGS = -O3 $(INCLUDES) ${DFLAGS} -arch=sm_13 #-deviceemu
LDFLAGS = -fPIC $(LIB)
CCOBJECTS = $(CCFILES:.cpp=.o)
CUOBJECTS = $(CUFILES:.cu=.o)
......
......@@ -39,6 +39,19 @@ inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
}
}
// For kernels with precision conversion built in
inline void checkHalfSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
exit(-1);
}
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
}
}
template <typename Float>
void zeroCuda(Float* dst, int len) {
// cuda's floating point format, IEEE-754, represents the floating point
......@@ -57,23 +70,58 @@ void zeroQuda(ParitySpinor a) {
}
}
template <typename Float>
void copyCuda(Float* dst, Float *src, int len) {
cudaMemcpy(dst, src, len*sizeof(Float), cudaMemcpyDeviceToDevice);
__global__ void convertDSKernel(double2 *dst, float4 *src, int length) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < length) {
for (int k=0; k<6; k++) {
dst[2*k*length+i].x = src[k*length+i].x;
dst[2*k*length+i].y = src[k*length+i].y;
dst[(2*k+1)*length+i].x = src[k*length+i].z;
dst[(2*k+1)*length+i].y = src[k*length+i].w;
}
i += gridSize;
}
}
__global__ void convertSDKernel(float4 *dst, double2 *src, int length) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < length) {
for (int k=0; k<6; k++) {
dst[k*length+i].x = src[2*k*length+i].x;
dst[k*length+i].y = src[2*k*length+i].y;
dst[k*length+i].z = src[(2*k+1)*length+i].x;
dst[k*length+i].w = src[(2*k+1)*length+i].y;
}
i += gridSize;
}
}
void copyQuda(ParitySpinor dst, ParitySpinor src) {
checkSpinor(dst, src);
if (dst.precision == QUDA_DOUBLE_PRECISION) copyCuda((double*)dst.spinor, (double*)src.spinor, dst.length);
else copyCuda((float*)dst.spinor, (float*)src.spinor, src.length);
checkHalfSpinor(dst, src);
int convertLength = dst.length / 24;
int blocks = min(REDUCE_MAX_BLOCKS, max(convertLength/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION)
convertDSKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, (float4*)src.spinor, convertLength);
else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION)
convertSDKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, (double2*)src.spinor, convertLength);
else if (dst.precision == QUDA_DOUBLE_PRECISION)
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(double), cudaMemcpyDeviceToDevice);
else
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(float), cudaMemcpyDeviceToDevice);
}
template <typename Float>
__global__ void axpbyKernel(Float a, Float *x, Float b, Float *y, int len) {
__global__ void axpbyKernel(Float a, Float *x, Float b, Float *y, int length) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
while (i < length) {
y[i] = a*x[i] + b*y[i];
i += gridSize;
}
......@@ -91,6 +139,28 @@ void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y) {
axpbyKernel<<<dimGrid, dimBlock>>>((float)a, (float*)x.spinor, (float)b, (float*)y.spinor, x.length);
}
template <typename xFloat, typename yFloat>
__global__ void xpyKernel(xFloat *x, yFloat *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
y[i] += x[i];
i += gridSize;
}
}
// performs the operation y[i] = x[i] + y[i]
void xpyQuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y);
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (x.precision == QUDA_DOUBLE_PRECISION)
xpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
else
xpyKernel<<<dimGrid, dimBlock>>>((float*)x.spinor, (float*)y.spinor, x.length);
}
template <typename Float>
__global__ void axpyKernel(Float a, Float *x, Float *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
......@@ -533,6 +603,36 @@ double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y) {
}
//
// double axpyNormCuda(float a, float *x, float *y, n){}
//
// First performs the operation y[i] = a*x[i] + y[i]
// Second returns the norm of y
//
template <typename Float>
#define REDUCE_FUNC_NAME(suffix) xmyNorm##suffix
#define REDUCE_TYPES Float *x, Float *y
#define REDUCE_PARAMS x, y
#define REDUCE_AUXILIARY(i) y[i] = x[i] - y[i]
#define REDUCE_OPERATION(i) (y[i]*y[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
double xmyNormQuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return xmyNormCuda((double*)x.spinor, (double*)y.spinor, x.length);
} else {
return xmyNormCuda((float*)x.spinor, (float*)y.spinor, x.length);
}
}
//
// double2 cDotProductCuda(float2 *a, float2 *b, int n) {}
//
......
......@@ -17,10 +17,12 @@ extern "C" {
double sumQuda(ParitySpinor b);
double normQuda(ParitySpinor b);
double reDotProductQuda(ParitySpinor a, ParitySpinor b);
double xmyNormQuda(ParitySpinor a, ParitySpinor b);
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyQuda(double a, ParitySpinor x, ParitySpinor y);
void axQuda(double a, ParitySpinor x);
void xpyQuda(ParitySpinor x, ParitySpinor y);
void xpayQuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyQuda(ParitySpinor x, ParitySpinor y);
......
......@@ -26,7 +26,7 @@ texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
// Single precision input spinor field
// Double precision input spinor field
texture<int4, 1> spinorTexDouble;
// Single precision input spinor field
......@@ -211,12 +211,23 @@ void bindGaugeTex(FullGauge gauge, int oddBit) {
// ----------------------------------------------------------------------
void checkSpinor(ParitySpinor out, ParitySpinor in) {
if (in.precision != out.precision) {
printf("Error in dslash quda: input and out spinor precision's don't match\n");
exit(-1);
}
}
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
checkSpinor(in, out);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashDCuda(out, gauge, in, parity, dagger);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashSCuda(out, gauge, in, parity, dagger);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
} else if (in.precision == QUDA_HALF_PRECISION) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......@@ -410,11 +421,13 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
checkSpinor(in, out);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
} else if (in.precision == QUDA_HALF_PRECISION) {
printf("Not yet implemented\n");
exit(-1);
......@@ -624,9 +637,13 @@ int dslashCudaSharedBytes() {
// Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 0);
dslashXpayDCuda(out, gauge, tmp, 0, 0, in, kappa2);
......@@ -634,7 +651,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashDCuda(tmp, gauge, in, 0, 0);
dslashXpayDCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 0);
dslashXpaySCuda(out, gauge, tmp, 0, 0, in, kappa2);
......@@ -642,7 +659,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashSCuda(tmp, gauge, in, 0, 0);
dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
} else if (in.precision == QUDA_HALF_PRECISION) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......@@ -662,9 +679,13 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
// Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 1);
dslashXpayDCuda(out, gauge, tmp, 0, 1, in, kappa2);
......@@ -672,7 +693,7 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap
dslashDCuda(tmp, gauge, in, 0, 1);
dslashXpayDCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 1);
dslashXpaySCuda(out, gauge, tmp, 0, 1, in, kappa2);
......@@ -705,14 +726,15 @@ void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
// Apply the full operator
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
checkSpinor(in.even, out.even);
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
if (in.even.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
} else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatCuda\n");
exit(-1);
}
......@@ -721,14 +743,15 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
// Apply the full operator dagger
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
checkSpinor(in.even, out.even);
if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
} else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
} else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatDaggerCuda\n");
exit(-1);
}
......
......@@ -30,10 +30,10 @@ int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_precise = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
......
......@@ -58,7 +58,7 @@ extern "C" {
} QudaPreserveSource;
typedef enum QudaReconstructType_s {
QUDA_RECONSTRUCT_NO, // store all 18 real numbers epxlicitly
QUDA_RECONSTRUCT_NO, // store all 18 real numbers explicitly
QUDA_RECONSTRUCT_8, // reconstruct from 8 real numbers
QUDA_RECONSTRUCT_12 // reconstruct from 12 real numbers
} QudaReconstructType;
......
......@@ -8,22 +8,36 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSloppy,
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
copyQuda(b, source);
copyQuda(r, b);
ParitySpinor x_sloppy, r_sloppy, tmp_sloppy, src_sloppy;
if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
tmp_sloppy = tmp;
src_sloppy = src;
} else {
x_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
copyQuda(src_sloppy, src);
}
zeroQuda(x_sloppy);
copyQuda(b, src);
copyQuda(r_sloppy, src_sloppy);
zeroQuda(y);
zeroQuda(x);
double b2 = normQuda(b);
double r2 = b2;
......@@ -54,13 +68,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int k=0;
int xUpdate = 0, rUpdate = 0;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x));
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0);
copyQuda(p, r);
copyQuda(p, r_sloppy);
} else {
alpha_omega = cuCdiv(alpha, omega);
rho_rho0 = cuCdiv(rho, rho0);
......@@ -68,37 +82,35 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
// p = r - beta*omega*v + beta*(p)
beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0;
cxpaypbzQuda(r, beta_omega, v, beta, p); // 8
cxpaypbzQuda(r_sloppy, beta_omega, v, beta, p); // 8
}
if (dag_type == QUDA_DAG_NO)
//rv = MatPCcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
else
//rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
rv = cDotProductQuda(source, v);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
rv = cDotProductQuda(src_sloppy, v);
alpha = cuCdiv(rho, rv);
// r -= alpha*v
alpha.x *= -1.0; alpha.y *= -1.0;
caxpyQuda(alpha, v, r); // 4
caxpyQuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
else
MatPCDagCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCDagCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormAQuda(t, r); // 6
omega_t2 = cDotProductNormAQuda(t, r_sloppy); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r, x, t, source);
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates (ideally should be double precision)
rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm;
......@@ -106,28 +118,24 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) {
//QudaPrecision spinorPrec = invert_param->cuda_prec;
//invert_param -> cuda_prec = QUDA_SINGLE_PRECISION;
if (updateR) {
if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
//invert_param -> cuda_prec = spinorPrec;
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
copyQuda(r, b);
mxpyQuda(t, r);
r2 = normQuda(r);
r2 = xmyNormQuda(b, r);
if (x.precision != r_sloppy.precision) copyQuda(r_sloppy, r);
rNorm = sqrt(r2);
maxrr = rNorm;
rUpdate++;
if (updateX) {
axpyQuda(1.0, x, y);
zeroQuda(x);
xpyQuda(x, y);
zeroQuda(x_sloppy);
copyQuda(b, r);
r0Norm = rNorm;
......@@ -136,16 +144,17 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
}
}
k++;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x));
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
}
axpyQuda(1.0, y, x);
if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
xpyQuda(y, x);
invert_param->secs += stopwatchReadSeconds();
//if (k==maxiters) printf("Exceeded maximum iterations %d\n", maxiters);
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
......@@ -159,17 +168,21 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
//#if 0
// Calculate the true residual
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
copyQuda(r, source);
mxpyQuda(t, r);
double true_res = normQuda(r);
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
double true_res = xmyNormQuda(src, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, r2, sqrt(true_res / b2));
//#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(src_sloppy);
freeParitySpinor(tmp_sloppy);
freeParitySpinor(r_sloppy);
freeParitySpinor(x_sloppy);
}
freeParitySpinor(b);
freeParitySpinor(y);
freeParitySpinor(r);
......
......@@ -24,8 +24,8 @@ void printGaugeParam(QudaGaugeParam *param) {
printf("anisotropy = %e\n", param->anisotropy);
printf("gauge_order = %d\n", param->gauge_order);
printf("cpu_prec = %d\n", param->cpu_prec);
printf("cuda_prec_precise = %d\n", param->cuda_prec_precise);
printf("reconstruct_precise = %d\n", param->reconstruct_precise);
printf("cuda_prec = %d\n", param->cuda_prec);
printf("reconstruct = %d\n", param->reconstruct);
printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy);
printf("reconstruct_sloppy = %d\n", param->reconstruct_sloppy);
printf("gauge_fix = %d\n", param->gauge_fix);
......@@ -105,12 +105,12 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1);
}
gauge_param->packed_size = (gauge_param->reconstruct_precise == QUDA_RECONSTRUCT_8) ? 8 : 12;
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct_precise, gauge_param->cuda_prec_precise);
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, gauge_param->cuda_prec);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct_precise) {
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
} else {
......@@ -263,6 +263,20 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
loadSpinorField(b, h_b, param->cpu_prec, param->cuda_prec, param->dirac_order);
/*ParitySpinor t = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
ParitySpinor s = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
dslashCuda(in, cudaGaugeSloppy, b.odd, 0, 0);
copyQuda(s, b.odd);
dslashCuda(t, cudaGaugeSloppy, s, 0, 0);
copyQuda(b.even, t);
printf("%e %e %e\n", normQuda(in), normQuda(t), normQuda(b.even));