Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 246ae47c authored by mikeaclark
Browse files

Added initial padding support for spinor fields, fixed a reduction bug

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@528 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7ee1d3d5
......@@ -37,7 +37,7 @@ LDFLAGS = -fPIC $(LIB)
all: dslash_test invert_test su3_test pack_test blas_test
QUDA = libquda.a
QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o \
QUDA_OBJS = blas_quda.o blas_reference.o clover_field.o clover_quda.o dslash_quda.o \
dslash_reference.o gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o \
invert_quda.o spinor_quda.o util_quda.o
QUDA_HDRS = blas_quda.h blas_reference.h clover_def.h dslash_def.h \
......
This diff is collapsed.
......@@ -19,14 +19,15 @@ void init() {
int X[4];
X[0] = 32;
X[1] = 32;
X[2] = 32;
X[3] = 32;
X[0] = 24;
X[1] = 24;
X[2] = 24;
X[3] = 64;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.verbosity = QUDA_VERBOSE;
inv_param.sp_pad = 0;
invert_param = &inv_param;
......@@ -35,11 +36,11 @@ void init() {
// need single parity dimensions
X[0] /= 2;
v = allocateParitySpinor(X, inv_param.cuda_prec);
w = allocateParitySpinor(X, inv_param.cuda_prec);
x = allocateParitySpinor(X, inv_param.cuda_prec);
y = allocateParitySpinor(X, inv_param.cuda_prec);
z = allocateParitySpinor(X, inv_param.cuda_prec);
v = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
w = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
x = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
y = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
z = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
}
......
......@@ -328,13 +328,12 @@ volatile spinorFloat o31_im;
volatile spinorFloat o32_re;
volatile spinorFloat o32_im;
#include "read_gauge.h"
#include "read_clover.h"
#include "io_spinor.h"
int sid = blockIdx.x*blockDim.x + threadIdx.x;
int z1 = FAST_INT_DIVIDE(sid, X1h);
int x1h = sid - z1*X1h;
int z2 = FAST_INT_DIVIDE(z1, X2);
......
......@@ -35,7 +35,7 @@
#define DD_RECON 0
#define DD_GPREC 0
#define DD_SPREC 0
#define DD_CPREC 0
#define DD_CPREC 0 //
#endif
// set options for current iteration
......@@ -249,7 +249,8 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_SPREC
#define DD_SPREC 2
#else
#undef DD_SPREC
#undef DD_SPREC // from here
#define DD_SPREC 0
#if (DD_CPREC==0)
......@@ -261,7 +262,8 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#elif (DD_CPREC==2)
#undef DD_CPREC
#define DD_CPREC 3
#else
#else // to here
#undef DD_LOOP
#undef DD_DAG
......@@ -269,9 +271,9 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_RECON
#undef DD_GPREC
#undef DD_SPREC
#undef DD_CPREC
#undef DD_CPREC //
#endif // DD_CPREC
#endif // DD_CPREC //
#endif // DD_SPREC
#endif // DD_GPREC
#endif // DD_RECON
......
......@@ -4,123 +4,47 @@
#include <dslash_quda.h>
#include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed
#if (__CUDA_ARCH__ == 130)
// Fetch element i from an int4 texture and reassemble it into a double2.
// Each double is rebuilt from two 32-bit words of the texel via
// __hiloint2double(hi, lo): (y, x) give the first component and (w, z)
// give the second.
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  const int4 texel = tex1Dfetch(t, i);
  const double first = __hiloint2double(texel.y, texel.x);
  const double second = __hiloint2double(texel.w, texel.z);
  return make_double2(first, second);
}
#endif
// Double precision gauge field
texture<int4, 1> gauge0TexDouble;
texture<int4, 1> gauge1TexDouble;
// Single precision gauge field
texture<float4, 1, cudaReadModeElementType> gauge0TexSingle;
texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
// Half precision gauge field
texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
// Double precision input spinor field
texture<int4, 1> spinorTexDouble;
// Single precision input spinor field
texture<float4, 1, cudaReadModeElementType> spinorTexSingle;
// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> spinorTexHalf;
texture<float, 1, cudaReadModeElementType> spinorTexNorm;
// Double precision accumulate spinor field
texture<int4, 1> accumTexDouble;
// Single precision accumulate spinor field
texture<float4, 1, cudaReadModeElementType> accumTexSingle;
// Half precision accumulate spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> accumTexHalf;
texture<float, 1, cudaReadModeElementType> accumTexNorm;
// Double precision clover term
texture<int4, 1> cloverTexDouble;
// Single precision clover term
texture<float4, 1, cudaReadModeElementType> cloverTexSingle;
// Half precision clover term
texture<short4, 1, cudaReadModeNormalizedFloat> cloverTexHalf;
texture<float, 1, cudaReadModeElementType> cloverTexNorm;
#include<dslash_textures.h>
#include<dslash_constants.h>
QudaGaugeParam *gauge_param;
QudaInvertParam *invert_param;
__constant__ int X1h;
__constant__ int X1;
__constant__ int X2;
__constant__ int X3;
__constant__ int X4;
__constant__ int X1m1;
__constant__ int X2m1;
__constant__ int X3m1;
__constant__ int X4m1;
__constant__ int X2X1mX1;
__constant__ int X3X2X1mX2X1;
__constant__ int X4X3X2X1mX3X2X1;
__constant__ int X4X3X2X1hmX3X2X1h;
__constant__ float X1h_inv;
__constant__ float X2_inv;
__constant__ float X3_inv;
__constant__ int X2X1;
__constant__ int X3X2X1;
__constant__ int Vh;
__constant__ int gauge_fixed;
// single precision constants
__constant__ float anisotropy_f;
__constant__ float t_boundary_f;
__constant__ float pi_f;
// double precision constants
__constant__ double anisotropy;
__constant__ double t_boundary;
static int initDslash = 0;
unsigned long long dslash_quda_flops;
unsigned long long dslash_quda_bytes;
#define BLOCK_DIM 64
//#include <dslash_def.h> // Dslash kernel definitions
// kludge to avoid '#include nested too deeply' error
//#define DD_CPREC 3
#define DD_DAG 0
#include <dslash_def.h>
#undef DD_DAG
#define DD_DAG 1
#include <dslash_def.h>
#undef DD_DAG
//#undef DD_CPREC
#include <clover_def.h> // kernels for applying the clover term alone
static void initDslashCuda(FullGauge gauge) {
// Return the product BLOCK_DIM * SHARED_FLOATS_PER_THREAD * precision,
// i.e. the shared-memory size in bytes for one thread block.
// NOTE(review): the precision value is used directly as a multiplier, so it
// presumably encodes the bytes per element -- confirm against the enum.
int dslashCudaSharedBytes(Precision precision) {
  const int sharedBytes = BLOCK_DIM*SHARED_FLOATS_PER_THREAD*precision;
  return sharedBytes;
}
#include <dslash_common.h>
int initDslash = 0;
void initDslashConstants(FullGauge gauge, int sp_stride) {
int Vh = gauge.volume;
cudaMemcpyToSymbol("Vh", &Vh, sizeof(int));
if (gauge.blockDim%64 != 0) {
printf("Sorry, block size not set approriately\n");
exit(-1);
}
cudaMemcpyToSymbol("sp_stride", &sp_stride, sizeof(int));
if (Vh%gauge.blockDim !=0) {
printf("Sorry, volume is not a multiple of number of threads %d\n", gauge.blockDim);
if (Vh%BLOCK_DIM != 0) {
printf("Error, volume not a multiple of the thread block size\n");
exit(-1);
}
......@@ -198,7 +122,7 @@ static void initDslashCuda(FullGauge gauge) {
initDslash = 1;
}
static void bindGaugeTex(FullGauge gauge, int oddBit) {
void bindGaugeTex(FullGauge gauge, int oddBit) {
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (oddBit) {
cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes);
......@@ -226,79 +150,13 @@ static void bindGaugeTex(FullGauge gauge, int oddBit) {
}
}
// Bind the clover data of a single parity to the texture reference that
// matches its precision. Double and single precision each bind one texture;
// every other precision falls through to the half-precision path, which binds
// both the packed data and its norm array (sized as clover.bytes/18).
static void bindCloverTex(ParityClover clover) {
  if (clover.precision == QUDA_DOUBLE_PRECISION) {
    cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
    return;
  }

  if (clover.precision == QUDA_SINGLE_PRECISION) {
    cudaBindTexture(0, cloverTexSingle, clover.clover, clover.bytes);
    return;
  }

  // Neither double nor single: treat as half precision.
  cudaBindTexture(0, cloverTexHalf, clover.clover, clover.bytes);
  cudaBindTexture(0, cloverTexNorm, clover.cloverNorm, clover.bytes/18);
}
// ----------------------------------------------------------------------
// Sanity-check a pair of parity spinors before a dslash call.
// Terminates the program (exit(-1)) when the two precisions differ or, under
// the preprocessor guard below, when the input spinor is double precision.
static void checkSpinor(ParitySpinor out, ParitySpinor in) {
  const int precisionsDiffer = (in.precision != out.precision);
  if (precisionsDiffer) {
    printf("Error in dslash quda: input and out spinor precisions don't match\n");
    exit(-1);
  }

  // NOTE(review): __CUDA_ARCH__ is normally defined only while compiling
  // device code; confirm this guard behaves as intended in a host function.
#if (__CUDA_ARCH__ != 130)
  if (QUDA_DOUBLE_PRECISION == in.precision) {
    printf("Double precision not supported on this GPU\n");
    exit(-1);
  }
#endif
}
// Verify that a parity spinor is compatible with the gauge field.
// Terminates the program (exit(-1)) when the volumes differ or, under the
// preprocessor guard below, when the gauge field is double precision.
static void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
  const int volumesMatch = (spinor.volume == gauge.volume);
  if (!volumesMatch) {
    printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume);
    exit(-1);
  }

  // NOTE(review): __CUDA_ARCH__ is normally defined only while compiling
  // device code; confirm this guard behaves as intended in a host function.
#if (__CUDA_ARCH__ != 130)
  if (QUDA_DOUBLE_PRECISION == gauge.precision) {
    printf("Double precision not supported on this GPU\n");
    exit(-1);
  }
#endif
}
// Verify that a parity spinor is compatible with both parities of a clover
// field. The even parity is checked first, then the odd parity; either
// mismatch terminates the program (exit(-1)). Under the preprocessor guard
// below, a double-precision clover parity is rejected as well.
static void checkCloverSpinor(ParitySpinor spinor, FullClover clover) {
  const int evenMatches = (spinor.volume == clover.even.volume);
  if (!evenMatches) {
    printf("Error, spinor volume %d doesn't match even clover volume %d\n",
           spinor.volume, clover.even.volume);
    exit(-1);
  }

  const int oddMatches = (spinor.volume == clover.odd.volume);
  if (!oddMatches) {
    printf("Error, spinor volume %d doesn't match odd clover volume %d\n",
           spinor.volume, clover.odd.volume);
    exit(-1);
  }

  // NOTE(review): __CUDA_ARCH__ is normally defined only while compiling
  // device code; confirm this guard behaves as intended in a host function.
#if (__CUDA_ARCH__ != 130)
  if (!((clover.even.precision != QUDA_DOUBLE_PRECISION) &&
        (clover.odd.precision != QUDA_DOUBLE_PRECISION))) {
    printf("Double precision not supported on this GPU\n");
    exit(-1);
  }
#endif
}
// Return the shared-memory size in bytes for one thread block of blockDim
// threads: blockDim * SHARED_FLOATS_PER_THREAD elements, each sizeof(double)
// when precision is QUDA_DOUBLE_PRECISION and sizeof(float) for any other
// precision.
int dslashCudaSharedBytes(Precision precision, int blockDim) {
  const int sharedBytes = (precision == QUDA_DOUBLE_PRECISION)
    ? blockDim*SHARED_FLOATS_PER_THREAD*sizeof(double)
    : blockDim*SHARED_FLOATS_PER_THREAD*sizeof(float);
  return sharedBytes;
}
// ----------------------------------------------------------------------
// plain Wilson Dslash:
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......@@ -316,8 +174,8 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -378,8 +236,8 @@ void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -440,8 +298,8 @@ void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -501,7 +359,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......@@ -520,8 +378,8 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -585,8 +443,8 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -650,8 +508,8 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -753,6 +611,18 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
}
// Bind one parity of a clover field to the texture reference for its
// precision. Double and single precision bind a single texture each. Any
// other precision is handled as half precision, which additionally binds the
// norm array using a size of clover.bytes/18.
void bindCloverTex(ParityClover clover) {
  const int isDouble = (clover.precision == QUDA_DOUBLE_PRECISION);
  const int isSingle = (clover.precision == QUDA_SINGLE_PRECISION);

  if (!isDouble && !isSingle) {
    // Fallback path: half-precision data plus its norm array.
    cudaBindTexture(0, cloverTexHalf, clover.clover, clover.bytes);
    cudaBindTexture(0, cloverTexNorm, clover.cloverNorm, clover.bytes/18);
    return;
  }

  if (isDouble) {
    cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
  } else {
    cudaBindTexture(0, cloverTexSingle, clover.clover, clover.bytes);
  }
}
// ----------------------------------------------------------------------
// clover-improved Wilson Dslash
//
......@@ -762,7 +632,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
ParitySpinor in, int parity, int dagger)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, cloverInv);
......@@ -781,8 +651,8 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -941,8 +811,8 @@ void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashSCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1104,8 +974,8 @@ void cloverDslashSCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1268,7 +1138,7 @@ void cloverDslashHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in,
int parity, int dagger, ParitySpinor x, double a)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, cloverInv);
......@@ -1288,8 +1158,8 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1448,8 +1318,8 @@ void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1612,8 +1482,8 @@ void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1862,7 +1732,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type)
{
ParitySpinor aux = allocateParitySpinor(out.X, out.precision); // FIXME: eliminate aux
ParitySpinor aux = allocateParitySpinor(out.X, out.precision, out.pad); // FIXME: eliminate aux
cloverMatPCCuda(aux, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, clover, cloverInv, aux, kappa, tmp, matpc_type, 1);
freeParitySpinor(aux);
......@@ -1885,7 +1755,7 @@ void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
ParitySpinor in, int parity)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, clover);
......@@ -1904,8 +1774,8 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1937,8 +1807,8 @@ void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1970,8 +1840,8 @@ void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......
......@@ -113,7 +113,7 @@ extern "C" {
ParitySpinor spinor, int oddBit);
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param);
......
......@@ -36,7 +36,7 @@ void init() {
gaugeParam.X[0] = 24;
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24;
gaugeParam.X[3] = 48;
gaugeParam.X[3] = 64;
setDims(gaugeParam.X);
gaugeParam.anisotropy = 2.3;
......@@ -51,8 +51,6 @@ void init() {
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.blockDim = 64;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
......@@ -66,12 +64,14 @@ void init() {
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.sp_pad = 24*24*24;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_HALF_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
......@@ -143,9 +143,9 @@ void init() {
if (!TRANSFER) {
gaugeParam.X[0] /= 2;
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
gaugeParam.X[0] *= 2;