Advanced Computing Platform for Theoretical Physics

Commit 85b72c53 authored by mikeaclark's avatar mikeaclark
Browse files

Finally, double/single precision inverter implemented

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@398 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 6d144a26
CUFILES := dslash_cuda.cu blas_cuda.cu CUFILES := dslash_cuda.cu blas_cuda.cu
CCFILES := inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp gauge_cuda.cpp spinor_quda.cpp CCFILES := inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp gauge_cuda.cpp spinor_quda.cpp
CUDA_INSTALL_PATH = /usr/local/cuda CUDA_INSTALL_PATH = /usr/local/cuda-2.0.0.2.1221
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_INSTALL_PATH)/../cuda_sdk/common/inc INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_INSTALL_PATH)/../cuda_sdk/common/inc
LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart
DFLAGS = #-D__DEVICE_EMULATION__ DFLAGS = #-D__DEVICE_EMULATION__
CC = gcc CC = gcc
CFLAGS = -Wall -g -std=c99 $(INCLUDES) ${DFLAGS} CFLAGS = -Wall -O3 -std=c99 $(INCLUDES) ${DFLAGS}
CXX = g++ CXX = g++
CXXFLAGS = -Wall -g $(INCLUDES) ${DFLAGS} CXXFLAGS = -Wall -O3 $(INCLUDES) ${DFLAGS}
NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
NVCCFLAGS = -g $(INCLUDES) ${DFLAGS} -arch=sm_13 #-deviceemu NVCCFLAGS = -O3 $(INCLUDES) ${DFLAGS} -arch=sm_13 #-deviceemu
LDFLAGS = -fPIC $(LIB) LDFLAGS = -fPIC $(LIB)
CCOBJECTS = $(CCFILES:.cpp=.o) CCOBJECTS = $(CCFILES:.cpp=.o)
CUOBJECTS = $(CUFILES:.cu=.o) CUOBJECTS = $(CUFILES:.cu=.o)
......
...@@ -39,6 +39,19 @@ inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) { ...@@ -39,6 +39,19 @@ inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
} }
} }
// For kernels with precision conversion built in
inline void checkHalfSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
exit(-1);
}
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
}
}
template <typename Float> template <typename Float>
void zeroCuda(Float* dst, int len) { void zeroCuda(Float* dst, int len) {
// cuda's floating point format, IEEE-754, represents the floating point // cuda's floating point format, IEEE-754, represents the floating point
...@@ -57,23 +70,58 @@ void zeroQuda(ParitySpinor a) { ...@@ -57,23 +70,58 @@ void zeroQuda(ParitySpinor a) {
} }
} }
template <typename Float> __global__ void convertDSKernel(double2 *dst, float4 *src, int length) {
void copyCuda(Float* dst, Float *src, int len) { unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
cudaMemcpy(dst, src, len*sizeof(Float), cudaMemcpyDeviceToDevice); unsigned int gridSize = gridDim.x*blockDim.x;
while (i < length) {
for (int k=0; k<6; k++) {
dst[2*k*length+i].x = src[k*length+i].x;
dst[2*k*length+i].y = src[k*length+i].y;
dst[(2*k+1)*length+i].x = src[k*length+i].z;
dst[(2*k+1)*length+i].y = src[k*length+i].w;
}
i += gridSize;
}
}
__global__ void convertSDKernel(float4 *dst, double2 *src, int length) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < length) {
for (int k=0; k<6; k++) {
dst[k*length+i].x = src[2*k*length+i].x;
dst[k*length+i].y = src[2*k*length+i].y;
dst[k*length+i].z = src[(2*k+1)*length+i].x;
dst[k*length+i].w = src[(2*k+1)*length+i].y;
}
i += gridSize;
}
} }
void copyQuda(ParitySpinor dst, ParitySpinor src) { void copyQuda(ParitySpinor dst, ParitySpinor src) {
checkSpinor(dst, src); checkHalfSpinor(dst, src);
if (dst.precision == QUDA_DOUBLE_PRECISION) copyCuda((double*)dst.spinor, (double*)src.spinor, dst.length);
else copyCuda((float*)dst.spinor, (float*)src.spinor, src.length); int convertLength = dst.length / 24;
int blocks = min(REDUCE_MAX_BLOCKS, max(convertLength/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (dst.precision == QUDA_DOUBLE_PRECISION && src.precision == QUDA_SINGLE_PRECISION)
convertDSKernel<<<dimGrid, dimBlock>>>((double2*)dst.spinor, (float4*)src.spinor, convertLength);
else if (dst.precision == QUDA_SINGLE_PRECISION && src.precision == QUDA_DOUBLE_PRECISION)
convertSDKernel<<<dimGrid, dimBlock>>>((float4*)dst.spinor, (double2*)src.spinor, convertLength);
else if (dst.precision == QUDA_DOUBLE_PRECISION)
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(double), cudaMemcpyDeviceToDevice);
else
cudaMemcpy(dst.spinor, src.spinor, dst.length*sizeof(float), cudaMemcpyDeviceToDevice);
} }
template <typename Float> template <typename Float>
__global__ void axpbyKernel(Float a, Float *x, Float b, Float *y, int len) { __global__ void axpbyKernel(Float a, Float *x, Float b, Float *y, int length) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x; unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x; unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) { while (i < length) {
y[i] = a*x[i] + b*y[i]; y[i] = a*x[i] + b*y[i];
i += gridSize; i += gridSize;
} }
...@@ -91,6 +139,28 @@ void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y) { ...@@ -91,6 +139,28 @@ void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y) {
axpbyKernel<<<dimGrid, dimBlock>>>((float)a, (float*)x.spinor, (float)b, (float*)y.spinor, x.length); axpbyKernel<<<dimGrid, dimBlock>>>((float)a, (float*)x.spinor, (float)b, (float*)y.spinor, x.length);
} }
template <typename xFloat, typename yFloat>
__global__ void xpyKernel(xFloat *x, yFloat *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
y[i] += x[i];
i += gridSize;
}
}
// performs the operation y[i] = x[i] + y[i]
void xpyQuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y);
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
if (x.precision == QUDA_DOUBLE_PRECISION)
xpyKernel<<<dimGrid, dimBlock>>>((double*)x.spinor, (double*)y.spinor, x.length);
else
xpyKernel<<<dimGrid, dimBlock>>>((float*)x.spinor, (float*)y.spinor, x.length);
}
template <typename Float> template <typename Float>
__global__ void axpyKernel(Float a, Float *x, Float *y, int len) { __global__ void axpyKernel(Float a, Float *x, Float *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x; unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
...@@ -533,6 +603,36 @@ double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y) { ...@@ -533,6 +603,36 @@ double axpyNormQuda(double a, ParitySpinor x, ParitySpinor y) {
} }
//
// double axpyNormCuda(float a, float *x, float *y, n){}
//
// First performs the operation y[i] = a*x[i] + y[i]
// Second returns the norm of y
//
template <typename Float>
#define REDUCE_FUNC_NAME(suffix) xmyNorm##suffix
#define REDUCE_TYPES Float *x, Float *y
#define REDUCE_PARAMS x, y
#define REDUCE_AUXILIARY(i) y[i] = x[i] - y[i]
#define REDUCE_OPERATION(i) (y[i]*y[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
double xmyNormQuda(ParitySpinor x, ParitySpinor y) {
checkSpinor(x,y);
if (x.precision == QUDA_DOUBLE_PRECISION) {
return xmyNormCuda((double*)x.spinor, (double*)y.spinor, x.length);
} else {
return xmyNormCuda((float*)x.spinor, (float*)y.spinor, x.length);
}
}
// //
// double2 cDotProductCuda(float2 *a, float2 *b, int n) {} // double2 cDotProductCuda(float2 *a, float2 *b, int n) {}
// //
......
...@@ -17,10 +17,12 @@ extern "C" { ...@@ -17,10 +17,12 @@ extern "C" {
double sumQuda(ParitySpinor b); double sumQuda(ParitySpinor b);
double normQuda(ParitySpinor b); double normQuda(ParitySpinor b);
double reDotProductQuda(ParitySpinor a, ParitySpinor b); double reDotProductQuda(ParitySpinor a, ParitySpinor b);
double xmyNormQuda(ParitySpinor a, ParitySpinor b);
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y); void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y);
void axpyQuda(double a, ParitySpinor x, ParitySpinor y); void axpyQuda(double a, ParitySpinor x, ParitySpinor y);
void axQuda(double a, ParitySpinor x); void axQuda(double a, ParitySpinor x);
void xpyQuda(ParitySpinor x, ParitySpinor y);
void xpayQuda(ParitySpinor x, double a, ParitySpinor y); void xpayQuda(ParitySpinor x, double a, ParitySpinor y);
void mxpyQuda(ParitySpinor x, ParitySpinor y); void mxpyQuda(ParitySpinor x, ParitySpinor y);
......
...@@ -26,7 +26,7 @@ texture<float4, 1, cudaReadModeElementType> gauge1TexSingle; ...@@ -26,7 +26,7 @@ texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf; texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf; texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
// Single precision input spinor field // Double precision input spinor field
texture<int4, 1> spinorTexDouble; texture<int4, 1> spinorTexDouble;
// Single precision input spinor field // Single precision input spinor field
...@@ -211,12 +211,23 @@ void bindGaugeTex(FullGauge gauge, int oddBit) { ...@@ -211,12 +211,23 @@ void bindGaugeTex(FullGauge gauge, int oddBit) {
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
void checkSpinor(ParitySpinor out, ParitySpinor in) {
if (in.precision != out.precision) {
printf("Error in dslash quda: input and out spinor precision's don't match\n");
exit(-1);
}
}
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) { void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) { checkSpinor(in, out);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashDCuda(out, gauge, in, parity, dagger); dslashDCuda(out, gauge, in, parity, dagger);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashSCuda(out, gauge, in, parity, dagger); dslashSCuda(out, gauge, in, parity, dagger);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
dim3 gridDim(GRID_DIM, 1, 1); dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1); dim3 blockDim(BLOCK_DIM, 1, 1);
...@@ -410,11 +421,13 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, ...@@ -410,11 +421,13 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger, void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) { ParitySpinor x, double a) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) { checkSpinor(in, out);
if (in.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out, gauge, in, parity, dagger, x, a); dslashXpayDCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out, gauge, in, parity, dagger, x, a); dslashXpaySCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
printf("Not yet implemented\n"); printf("Not yet implemented\n");
exit(-1); exit(-1);
...@@ -624,9 +637,13 @@ int dslashCudaSharedBytes() { ...@@ -624,9 +637,13 @@ int dslashCudaSharedBytes() {
// Apply the even-odd preconditioned Dirac operator // Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) { ParitySpinor tmp, MatPCType matpc_type) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa; double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) { if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 0); dslashDCuda(tmp, gauge, in, 1, 0);
dslashXpayDCuda(out, gauge, tmp, 0, 0, in, kappa2); dslashXpayDCuda(out, gauge, tmp, 0, 0, in, kappa2);
...@@ -634,7 +651,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ...@@ -634,7 +651,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashDCuda(tmp, gauge, in, 0, 0); dslashDCuda(tmp, gauge, in, 0, 0);
dslashXpayDCuda(out, gauge, tmp, 1, 0, in, kappa2); dslashXpayDCuda(out, gauge, tmp, 1, 0, in, kappa2);
} }
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 0); dslashSCuda(tmp, gauge, in, 1, 0);
dslashXpaySCuda(out, gauge, tmp, 0, 0, in, kappa2); dslashXpaySCuda(out, gauge, tmp, 0, 0, in, kappa2);
...@@ -642,7 +659,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ...@@ -642,7 +659,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashSCuda(tmp, gauge, in, 0, 0); dslashSCuda(tmp, gauge, in, 0, 0);
dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2); dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2);
} }
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) { } else if (in.precision == QUDA_HALF_PRECISION) {
dim3 gridDim(GRID_DIM, 1, 1); dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1); dim3 blockDim(BLOCK_DIM, 1, 1);
...@@ -662,9 +679,13 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ...@@ -662,9 +679,13 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
// Apply the even-odd preconditioned Dirac operator // Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) { ParitySpinor tmp, MatPCType matpc_type) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa; double kappa2 = -kappa*kappa;
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) { if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 1); dslashDCuda(tmp, gauge, in, 1, 1);
dslashXpayDCuda(out, gauge, tmp, 0, 1, in, kappa2); dslashXpayDCuda(out, gauge, tmp, 0, 1, in, kappa2);
...@@ -672,7 +693,7 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap ...@@ -672,7 +693,7 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap
dslashDCuda(tmp, gauge, in, 0, 1); dslashDCuda(tmp, gauge, in, 0, 1);
dslashXpayDCuda(out, gauge, tmp, 1, 1, in, kappa2); dslashXpayDCuda(out, gauge, tmp, 1, 1, in, kappa2);
} }
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 1); dslashSCuda(tmp, gauge, in, 1, 1);
dslashXpaySCuda(out, gauge, tmp, 0, 1, in, kappa2); dslashXpaySCuda(out, gauge, tmp, 0, 1, in, kappa2);
...@@ -705,14 +726,15 @@ void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, ...@@ -705,14 +726,15 @@ void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
// Apply the full operator // Apply the full operator
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) { void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
checkSpinor(in.even, out.even);
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) { if (in.even.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa); dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa); dslashXpayDCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa); dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa); dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) { } else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatCuda\n"); printf("Half precision not supported in MatCuda\n");
exit(-1); exit(-1);
} }
...@@ -721,14 +743,15 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) { ...@@ -721,14 +743,15 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
// Apply the full operator dagger // Apply the full operator dagger
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) { void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
checkSpinor(in.even, out.even);
if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa); dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa); dslashXpayDCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) { } else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa); dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa); dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) { } else if (in.even.precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported in MatDaggerCuda\n"); printf("Half precision not supported in MatDaggerCuda\n");
exit(-1); exit(-1);
} }
......
...@@ -30,10 +30,10 @@ int TRANSFER = 1; // include transfer time in the benchmark? ...@@ -30,10 +30,10 @@ int TRANSFER = 1; // include transfer time in the benchmark?
void init() { void init() {
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.cuda_prec_precise = QUDA_DOUBLE_PRECISION; gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.X = L1; gaugeParam.X = L1;
gaugeParam.Y = L2; gaugeParam.Y = L2;
gaugeParam.Z = L3; gaugeParam.Z = L3;
......
...@@ -58,7 +58,7 @@ extern "C" { ...@@ -58,7 +58,7 @@ extern "C" {
} QudaPreserveSource; } QudaPreserveSource;
typedef enum QudaReconstructType_s { typedef enum QudaReconstructType_s {
QUDA_RECONSTRUCT_NO, // store all 18 real numbers epxlicitly QUDA_RECONSTRUCT_NO, // store all 18 real numbers explicitly
QUDA_RECONSTRUCT_8, // reconstruct from 8 real numbers QUDA_RECONSTRUCT_8, // reconstruct from 8 real numbers
QUDA_RECONSTRUCT_12 // reconstruct from 12 real numbers QUDA_RECONSTRUCT_12 // reconstruct from 12 real numbers
} QudaReconstructType; } QudaReconstructType;
......
...@@ -8,22 +8,36 @@ ...@@ -8,22 +8,36 @@
#include <spinor_quda.h> #include <spinor_quda.h>
#include <gauge_quda.h> #include <gauge_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSloppy, void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp, FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type) QudaInvertParam *invert_param, DagType dag_type)
{ {
ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor r = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor p = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor v = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor t = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor y = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision); ParitySpinor b = allocateParitySpinor(x.length/spinorSiteSize, x.precision);
copyQuda(b, source); ParitySpinor x_sloppy, r_sloppy, tmp_sloppy, src_sloppy;
copyQuda(r, b); if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
tmp_sloppy = tmp;
src_sloppy = src;
} else {
x_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
src_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
tmp_sloppy = allocateParitySpinor(x.length/spinorSiteSize, invert_param->cuda_prec_sloppy);
copyQuda(src_sloppy, src);
}
zeroQuda(x_sloppy);
copyQuda(b, src);
copyQuda(r_sloppy, src_sloppy);
zeroQuda(y); zeroQuda(y);
zeroQuda(x);
double b2 = normQuda(b); double b2 = normQuda(b);
double r2 = b2; double r2 = b2;
...@@ -54,13 +68,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop ...@@ -54,13 +68,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int k=0; int k=0;
int xUpdate = 0, rUpdate = 0; int xUpdate = 0, rUpdate = 0;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x)); printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
stopwatchStart(); stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) { while (r2 > stop && k<invert_param->maxiter) {
if (k==0) { if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0); rho = make_cuDoubleComplex(r2, 0.0);
copyQuda(p, r); copyQuda(p, r_sloppy);
} else { } else {
alpha_omega = cuCdiv(alpha, omega); alpha_omega = cuCdiv(alpha, omega);
rho_rho0 = cuCdiv(rho, rho0); rho_rho0 = cuCdiv(rho, rho0);
...@@ -68,37 +82,35 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop ...@@ -68,37 +82,35 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
// p = r - beta*omega*v + beta*(p) // p = r - beta*omega*v + beta*(p)
beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0; beta_omega = cuCmul(beta, omega); beta_omega.x *= -1.0; beta_omega.y *= -1.0;
cxpaypbzQuda(r, beta_omega, v, beta, p); // 8 cxpaypbzQuda(r_sloppy, beta_omega, v, beta, p); // 8
} }
if (dag_type == QUDA_DAG_NO) if (dag_type == QUDA_DAG_NO)
//rv = MatPCcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type); MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
else else
//rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type); MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
rv = cDotProductQuda(src_sloppy, v);
rv = cDotProductQuda(source, v);
alpha = cuCdiv(rho, rv); alpha = cuCdiv(rho, rv);
// r -= alpha*v // r -= alpha*v
alpha.x *= -1.0; alpha.y *= -1.0; alpha.x *= -1.0; alpha.y *= -1.0;
caxpyQuda(alpha, v, r); // 4 caxpyQuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0; alpha.x *= -1.0; alpha.y *= -1.0;
if (dag_type == QUDA_DAG_NO) if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type); MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
else else
MatPCDagCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type); MatPCDagCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
// omega = (t, r) / (t, t) // omega = (t, r) / (t, t)
omega_t2 = cDotProductNormAQuda(t, r); // 6 omega_t2 = cDotProductNormAQuda(t, r_sloppy); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z; omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r, x, t, source); rho_r2 = caxpbypzYmbwcDotProductWYNormYQuda(alpha, p, omega, r_sloppy, x_sloppy, t, src_sloppy);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z; rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
// reliable updates (ideally should be double precision) // reliable updates (ideally should be double precision)
rNorm = sqrt(r2); rNorm = sqrt(r2);
if (rNorm > maxrx) maxrx = rNorm; if (rNorm > maxrx) maxrx = rNorm;
...@@ -106,28 +118,24 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop ...@@ -106,28 +118,24 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0; int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0; int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) { if (updateR) {
//QudaPrecision spinorPrec = invert_param->cuda_prec; if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
//invert_param -> cuda_prec = QUDA_SINGLE_PRECISION;
if (dag_type == QUDA_DAG_NO) if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type); MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else else
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type); MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
//invert_param -> cuda_prec = spinorPrec;
copyQuda(r, b); r2 = xmyNormQuda(b, r);
mxpyQuda(t, r); if (x.precision != r_sloppy.precision) copyQuda(r_sloppy, r);
r2 = normQuda(r);
rNorm = sqrt(r2); rNorm = sqrt(r2);
maxrr = rNorm; maxrr = rNorm;
rUpdate++; rUpdate++;
if (updateX) { if (updateX) {
axpyQuda(1.0, x, y); xpyQuda(x, y);
zeroQuda(x); zeroQuda(x_sloppy);
copyQuda(b, r); copyQuda(b, r);
r0Norm = rNorm; r0Norm = rNorm;
...@@ -136,16 +144,17 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop ...@@ -136,16 +144,17 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
} }
} }
k++; k++;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x)); printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x_sloppy));
} }
axpyQuda(1.0, y, x);
if (x.precision != x_sloppy.precision) copyQuda(x, x_sloppy);
xpyQuda(y, x);
invert_param->secs += stopwatchReadSeconds(); invert_param->secs += stopwatchReadSeconds();
//if (k==maxiters) printf("Exceeded maximum iterations %d\n", maxiters); if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate); printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
...@@ -159,17 +168,21 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop ...@@ -159,17 +168,21 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
//#if 0 //#if 0
// Calculate the true residual // Calculate the true residual
if (dag_type == QUDA_DAG_NO) if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type); MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else else
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type); MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
copyQuda(r, source); double true_res = xmyNormQuda(src, r);
mxpyQuda(t, r);
double true_res = normQuda(r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, r2, sqrt(true_res / b2));
k, r2, true_res / b2);
//#endif //#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
freeParitySpinor(src_sloppy);
freeParitySpinor(tmp_sloppy);
freeParitySpinor(r_sloppy);
freeParitySpinor(x_sloppy);
}
freeParitySpinor(b); freeParitySpinor(b);
freeParitySpinor(y); freeParitySpinor(y);
freeParitySpinor(r); freeParitySpinor(r);
......
...@@ -24,8 +24,8 @@ void printGaugeParam(QudaGaugeParam *param) { ...@@ -24,8 +24,8 @@ void printGaugeParam(QudaGaugeParam *param) {
printf("anisotropy = %e\n", param->anisotropy); printf("anisotropy = %e\n", param->anisotropy);
printf("gauge_order = %d\n", param->gauge_order); printf("gauge_order = %d\n", param->gauge_order);
printf("cpu_prec = %d\n", param->cpu_prec); printf("cpu_prec = %d\n", param->cpu_prec);
printf("cuda_prec_precise = %d\n", param->cuda_prec_precise); printf("cuda_prec = %d\n", param->cuda_prec);
printf("reconstruct_precise = %d\n", param->reconstruct_precise); printf("reconstruct = %d\n", param->reconstruct);
printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy); printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy);
printf("reconstruct_sloppy = %d\n", param->reconstruct_sloppy); printf("reconstruct_sloppy = %d\n", param->reconstruct_sloppy);
printf("gauge_fix = %d\n", param->gauge_fix); printf("gauge_fix = %d\n", param->gauge_fix);
...@@ -105,12 +105,12 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) ...@@ -105,12 +105,12 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4); gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1); exit(-1);
} }
gauge_param->packed_size = (gauge_param->reconstruct_precise == QUDA_RECONSTRUCT_8) ? 8 : 12; gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct_precise, gauge_param->cuda_prec_precise); createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct, gauge_param->cuda_prec);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30); gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise || if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct_precise) { gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy); createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30); gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
} else { } else {
...@@ -263,6 +263,20 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param) ...@@ -263,6 +263,20 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
loadSpinorField(b, h_b, param->cpu_prec, param->cuda_prec, param->dirac_order); loadSpinorField(b, h_b, param->cpu_prec, param->cuda_prec, param->dirac_order);
/*ParitySpinor t = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
ParitySpinor s = allocateParitySpinor(Nh, invert_param->cuda_prec_sloppy);
dslashCuda(in, cudaGaugeSloppy, b.odd, 0, 0);
copyQuda(s, b.odd);
dslashCuda(t, cudaGaugeSloppy, s, 0, 0);
copyQuda(b.even, t);
printf("%e %e %e\n", normQuda(in), normQuda(t), normQuda(b.even));
FILE *file = fopen("test.dat","w");
for (int i=0; i<in.length; i++) {
fprintf(file, "%d %e %e %e\n", i, ((double*)in.spinor)[i], ((float*)t.spinor)[i], ((double*)b.even.spinor)[i]);
}
exit(0);*/
// multiply the source to get the mass normalization // multiply the source to get the mass normalization
if (param->mass_normalization == QUDA_MASS_NORMALIZATION) { if (param->mass_normalization == QUDA_MASS_NORMALIZATION) {
axQuda(2.0*kappa, b.even); axQuda(2.0*kappa, b.even);
......
...@@ -20,8 +20,8 @@ extern "C" { ...@@ -20,8 +20,8 @@ extern "C" {
QudaPrecision cpu_prec; QudaPrecision cpu_prec;
QudaPrecision cuda_prec_precise; QudaPrecision cuda_prec;
QudaReconstructType reconstruct_precise; QudaReconstructType reconstruct;
QudaPrecision cuda_prec_sloppy; QudaPrecision cuda_prec_sloppy;
QudaReconstructType reconstruct_sloppy; QudaReconstructType reconstruct_sloppy;
...@@ -54,6 +54,8 @@ extern "C" { ...@@ -54,6 +54,8 @@ extern "C" {
QudaPrecision cpu_prec; QudaPrecision cpu_prec;
QudaPrecision cuda_prec; QudaPrecision cuda_prec;
QudaPrecision cuda_prec_sloppy;
QudaDiracFieldOrder dirac_order; QudaDiracFieldOrder dirac_order;
double spinorGiB; double spinorGiB;
......
...@@ -17,11 +17,11 @@ int main(int argc, char **argv) ...@@ -17,11 +17,11 @@ int main(int argc, char **argv)
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION; Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec_precise = QUDA_DOUBLE_PRECISION; Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_precise = QUDA_RECONSTRUCT_8; Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.X = L1; Gauge_param.X = L1;
...@@ -30,20 +30,21 @@ int main(int argc, char **argv) ...@@ -30,20 +30,21 @@ int main(int argc, char **argv)
Gauge_param.T = L4; Gauge_param.T = L4;
Gauge_param.anisotropy = 1.0; Gauge_param.anisotropy = 1.0;
inv_param.inv_type = QUDA_CG_INVERTER; inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param; gauge_param = &Gauge_param;
double mass = 0.0; double mass = -0.97;
inv_param.kappa = 1.0 / (2.0*(4 + mass)); inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-6; inv_param.tol = 1e-12;
inv_param.maxiter = 20; inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-3; inv_param.reliable_delta = 1e-1;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION; inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
......
...@@ -47,10 +47,10 @@ void init() { ...@@ -47,10 +47,10 @@ void init() {
Precision single = QUDA_SINGLE_PRECISION; Precision single = QUDA_SINGLE_PRECISION;
param.cpu_prec = QUDA_SINGLE_PRECISION; param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec_precise = QUDA_SINGLE_PRECISION; param.cuda_prec = QUDA_SINGLE_PRECISION;
param.reconstruct_precise = QUDA_RECONSTRUCT_12; param.reconstruct = QUDA_RECONSTRUCT_12;
param.cuda_prec_sloppy = param.cuda_prec_precise; param.cuda_prec_sloppy = param.cuda_prec;
param.reconstruct_sloppy = param.reconstruct_precise; param.reconstruct_sloppy = param.reconstruct;
param.X = L1; param.X = L1;
param.Y = L2; param.Y = L2;
param.Z = L3; param.Z = L3;
...@@ -99,13 +99,13 @@ void packTest() { ...@@ -99,13 +99,13 @@ void packTest() {
stopwatchStart(); stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER; param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct_precise, param.cuda_prec_precise); createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct, param.cuda_prec);
double cpsGtime = stopwatchReadSeconds(); double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime); printf("CPS Gauge send time = %e seconds\n", cpsGtime);
stopwatchStart(); stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct_precise, param.cuda_prec_precise); createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct, param.cuda_prec);
double qdpGtime = stopwatchReadSeconds(); double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime); printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......
...@@ -6,8 +6,6 @@ ...@@ -6,8 +6,6 @@
#include <xmmintrin.h> #include <xmmintrin.h>
#define __DEVICE_EMULATION__
// GPU clover matrix // GPU clover matrix
FullClover cudaClover; FullClover cudaClover;
...@@ -46,9 +44,6 @@ ParitySpinor allocateParitySpinor(int geometric_length, Precision precision) { ...@@ -46,9 +44,6 @@ ParitySpinor allocateParitySpinor(int geometric_length, Precision precision) {
printf("Error allocating spinorNorm\n"); printf("Error allocating spinorNorm\n");
exit(0); exit(0);
} }
} else {
printf("Error allocating spinor: double precision not supported\n");
exit(0);
} }
return ret; return ret;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment