Advanced Computing Platform for Theoretical Physics

Commit 246ae47c authored by mikeaclark's avatar mikeaclark
Browse files

Added initial padding support for spinor fields, fixed a reduction bug

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@528 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7ee1d3d5
......@@ -37,7 +37,7 @@ LDFLAGS = -fPIC $(LIB)
all: dslash_test invert_test su3_test pack_test blas_test
QUDA = libquda.a
QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o \
QUDA_OBJS = blas_quda.o blas_reference.o clover_field.o clover_quda.o dslash_quda.o \
dslash_reference.o gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o \
invert_quda.o spinor_quda.o util_quda.o
QUDA_HDRS = blas_quda.h blas_reference.h clover_def.h dslash_def.h \
......
This diff is collapsed.
......@@ -19,14 +19,15 @@ void init() {
int X[4];
X[0] = 32;
X[1] = 32;
X[2] = 32;
X[3] = 32;
X[0] = 24;
X[1] = 24;
X[2] = 24;
X[3] = 64;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.verbosity = QUDA_VERBOSE;
inv_param.sp_pad = 0;
invert_param = &inv_param;
......@@ -35,11 +36,11 @@ void init() {
// need single parity dimensions
X[0] /= 2;
v = allocateParitySpinor(X, inv_param.cuda_prec);
w = allocateParitySpinor(X, inv_param.cuda_prec);
x = allocateParitySpinor(X, inv_param.cuda_prec);
y = allocateParitySpinor(X, inv_param.cuda_prec);
z = allocateParitySpinor(X, inv_param.cuda_prec);
v = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
w = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
x = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
y = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
z = allocateParitySpinor(X, inv_param.cuda_prec, inv_param.sp_pad);
}
......
......@@ -328,13 +328,12 @@ volatile spinorFloat o31_im;
volatile spinorFloat o32_re;
volatile spinorFloat o32_im;
#include "read_gauge.h"
#include "read_clover.h"
#include "io_spinor.h"
int sid = blockIdx.x*blockDim.x + threadIdx.x;
int z1 = FAST_INT_DIVIDE(sid, X1h);
int x1h = sid - z1*X1h;
int z2 = FAST_INT_DIVIDE(z1, X2);
......
......@@ -35,7 +35,7 @@
#define DD_RECON 0
#define DD_GPREC 0
#define DD_SPREC 0
#define DD_CPREC 0
#define DD_CPREC 0 //
#endif
// set options for current iteration
......@@ -249,7 +249,8 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_SPREC
#define DD_SPREC 2
#else
#undef DD_SPREC
#undef DD_SPREC // from here
#define DD_SPREC 0
#if (DD_CPREC==0)
......@@ -261,7 +262,8 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#elif (DD_CPREC==2)
#undef DD_CPREC
#define DD_CPREC 3
#else
#else // to here
#undef DD_LOOP
#undef DD_DAG
......@@ -269,9 +271,9 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_RECON
#undef DD_GPREC
#undef DD_SPREC
#undef DD_CPREC
#undef DD_CPREC //
#endif // DD_CPREC
#endif // DD_CPREC //
#endif // DD_SPREC
#endif // DD_GPREC
#endif // DD_RECON
......
......@@ -4,123 +4,47 @@
#include <dslash_quda.h>
#include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed
#if (__CUDA_ARCH__ == 130)
static __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t,i);
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#endif
// Double precision gauge field
texture<int4, 1> gauge0TexDouble;
texture<int4, 1> gauge1TexDouble;
// Single precision gauge field
texture<float4, 1, cudaReadModeElementType> gauge0TexSingle;
texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
// Half precision gauge field
texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
// Double precision input spinor field
texture<int4, 1> spinorTexDouble;
// Single precision input spinor field
texture<float4, 1, cudaReadModeElementType> spinorTexSingle;
// Half precision input spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> spinorTexHalf;
texture<float, 1, cudaReadModeElementType> spinorTexNorm;
// Double precision accumulate spinor field
texture<int4, 1> accumTexDouble;
// Single precision accumulate spinor field
texture<float4, 1, cudaReadModeElementType> accumTexSingle;
// Half precision accumulate spinor field
texture<short4, 1, cudaReadModeNormalizedFloat> accumTexHalf;
texture<float, 1, cudaReadModeElementType> accumTexNorm;
// Double precision clover term
texture<int4, 1> cloverTexDouble;
// Single precision clover term
texture<float4, 1, cudaReadModeElementType> cloverTexSingle;
// Half precision clover term
texture<short4, 1, cudaReadModeNormalizedFloat> cloverTexHalf;
texture<float, 1, cudaReadModeElementType> cloverTexNorm;
#include<dslash_textures.h>
#include<dslash_constants.h>
QudaGaugeParam *gauge_param;
QudaInvertParam *invert_param;
__constant__ int X1h;
__constant__ int X1;
__constant__ int X2;
__constant__ int X3;
__constant__ int X4;
__constant__ int X1m1;
__constant__ int X2m1;
__constant__ int X3m1;
__constant__ int X4m1;
__constant__ int X2X1mX1;
__constant__ int X3X2X1mX2X1;
__constant__ int X4X3X2X1mX3X2X1;
__constant__ int X4X3X2X1hmX3X2X1h;
__constant__ float X1h_inv;
__constant__ float X2_inv;
__constant__ float X3_inv;
__constant__ int X2X1;
__constant__ int X3X2X1;
__constant__ int Vh;
__constant__ int gauge_fixed;
// single precision constants
__constant__ float anisotropy_f;
__constant__ float t_boundary_f;
__constant__ float pi_f;
// double precision constants
__constant__ double anisotropy;
__constant__ double t_boundary;
static int initDslash = 0;
unsigned long long dslash_quda_flops;
unsigned long long dslash_quda_bytes;
#define BLOCK_DIM 64
//#include <dslash_def.h> // Dslash kernel definitions
// kludge to avoid '#include nested too deeply' error
//#define DD_CPREC 3
#define DD_DAG 0
#include <dslash_def.h>
#undef DD_DAG
#define DD_DAG 1
#include <dslash_def.h>
#undef DD_DAG
//#undef DD_CPREC
#include <clover_def.h> // kernels for applying the clover term alone
static void initDslashCuda(FullGauge gauge) {
int dslashCudaSharedBytes(Precision precision) {
return BLOCK_DIM*SHARED_FLOATS_PER_THREAD*precision;
}
#include <dslash_common.h>
int initDslash = 0;
void initDslashConstants(FullGauge gauge, int sp_stride) {
int Vh = gauge.volume;
cudaMemcpyToSymbol("Vh", &Vh, sizeof(int));
if (gauge.blockDim%64 != 0) {
printf("Sorry, block size not set approriately\n");
exit(-1);
}
cudaMemcpyToSymbol("sp_stride", &sp_stride, sizeof(int));
if (Vh%gauge.blockDim !=0) {
printf("Sorry, volume is not a multiple of number of threads %d\n", gauge.blockDim);
if (Vh%BLOCK_DIM != 0) {
printf("Error, volume not a multiple of the thread block size\n");
exit(-1);
}
......@@ -198,7 +122,7 @@ static void initDslashCuda(FullGauge gauge) {
initDslash = 1;
}
static void bindGaugeTex(FullGauge gauge, int oddBit) {
void bindGaugeTex(FullGauge gauge, int oddBit) {
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (oddBit) {
cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes);
......@@ -226,79 +150,13 @@ static void bindGaugeTex(FullGauge gauge, int oddBit) {
}
}
static void bindCloverTex(ParityClover clover) {
if (clover.precision == QUDA_DOUBLE_PRECISION) {
cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
} else if (clover.precision == QUDA_SINGLE_PRECISION) {
cudaBindTexture(0, cloverTexSingle, clover.clover, clover.bytes);
} else {
cudaBindTexture(0, cloverTexHalf, clover.clover, clover.bytes);
cudaBindTexture(0, cloverTexNorm, clover.cloverNorm, clover.bytes/18);
}
}
// ----------------------------------------------------------------------
static void checkSpinor(ParitySpinor out, ParitySpinor in) {
if (in.precision != out.precision) {
printf("Error in dslash quda: input and out spinor precisions don't match\n");
exit(-1);
}
#if (__CUDA_ARCH__ != 130)
if (in.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
}
#endif
}
static void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
if (spinor.volume != gauge.volume) {
printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume);
exit(-1);
}
#if (__CUDA_ARCH__ != 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
}
#endif
}
static void checkCloverSpinor(ParitySpinor spinor, FullClover clover) {
if (spinor.volume != clover.even.volume) {
printf("Error, spinor volume %d doesn't match even clover volume %d\n",
spinor.volume, clover.even.volume);
exit(-1);
}
if (spinor.volume != clover.odd.volume) {
printf("Error, spinor volume %d doesn't match odd clover volume %d\n",
spinor.volume, clover.odd.volume);
exit(-1);
}
#if (__CUDA_ARCH__ != 130)
if ((clover.even.precision == QUDA_DOUBLE_PRECISION) ||
(clover.odd.precision == QUDA_DOUBLE_PRECISION)) {
printf("Double precision not supported on this GPU\n");
exit(-1);
}
#endif
}
int dslashCudaSharedBytes(Precision precision, int blockDim) {
if (precision == QUDA_DOUBLE_PRECISION) return blockDim*SHARED_FLOATS_PER_THREAD*sizeof(double);
else return blockDim*SHARED_FLOATS_PER_THREAD*sizeof(float);
}
// ----------------------------------------------------------------------
// plain Wilson Dslash:
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......@@ -316,8 +174,8 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -378,8 +236,8 @@ void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -440,8 +298,8 @@ void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -501,7 +359,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......@@ -520,8 +378,8 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -585,8 +443,8 @@ void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -650,8 +508,8 @@ void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
bindGaugeTex(gauge, oddBit);
......@@ -753,6 +611,18 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
}
void bindCloverTex(ParityClover clover) {
if (clover.precision == QUDA_DOUBLE_PRECISION) {
cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
} else if (clover.precision == QUDA_SINGLE_PRECISION) {
cudaBindTexture(0, cloverTexSingle, clover.clover, clover.bytes);
} else {
cudaBindTexture(0, cloverTexHalf, clover.clover, clover.bytes);
cudaBindTexture(0, cloverTexNorm, clover.cloverNorm, clover.bytes/18);
}
}
// ----------------------------------------------------------------------
// clover-improved Wilson Dslash
//
......@@ -762,7 +632,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
ParitySpinor in, int parity, int dagger)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, cloverInv);
......@@ -781,8 +651,8 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -941,8 +811,8 @@ void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashSCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1104,8 +974,8 @@ void cloverDslashSCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
ParitySpinor spinor, int oddBit, int daggerBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1268,7 +1138,7 @@ void cloverDslashHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv,
void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in,
int parity, int dagger, ParitySpinor x, double a)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, cloverInv);
......@@ -1288,8 +1158,8 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1448,8 +1318,8 @@ void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1612,8 +1482,8 @@ void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge, FullClover cloverI
void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1862,7 +1732,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type)
{
ParitySpinor aux = allocateParitySpinor(out.X, out.precision); // FIXME: eliminate aux
ParitySpinor aux = allocateParitySpinor(out.X, out.precision, out.pad); // FIXME: eliminate aux
cloverMatPCCuda(aux, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0);
cloverMatPCCuda(out, gauge, clover, cloverInv, aux, kappa, tmp, matpc_type, 1);
freeParitySpinor(aux);
......@@ -1885,7 +1755,7 @@ void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
ParitySpinor in, int parity)
{
if (!initDslash) initDslashCuda(gauge);
if (!initDslash) initDslashConstants(gauge, in.stride);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
checkCloverSpinor(in, clover);
......@@ -1904,8 +1774,8 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1937,8 +1807,8 @@ void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......@@ -1970,8 +1840,8 @@ void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit)
{
dim3 gridDim(res.volume/gauge.blockDim, 1, 1);
dim3 blockDim(gauge.blockDim, 1, 1);
dim3 gridDim(res.volume/BLOCK_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
Precision clover_prec;
bindGaugeTex(gauge, oddBit);
......
......@@ -113,7 +113,7 @@ extern "C" {
ParitySpinor spinor, int oddBit);
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param);
......
......@@ -36,7 +36,7 @@ void init() {
gaugeParam.X[0] = 24;
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24;
gaugeParam.X[3] = 48;
gaugeParam.X[3] = 64;
setDims(gaugeParam.X);
gaugeParam.anisotropy = 2.3;
......@@ -51,8 +51,6 @@ void init() {
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gaugeParam.blockDim = 64;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
} else {
......@@ -66,12 +64,14 @@ void init() {
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.sp_pad = 24*24*24;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_HALF_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
......@@ -143,9 +143,9 @@ void init() {
if (!TRANSFER) {
gaugeParam.X[0] /= 2;
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec);
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
gaugeParam.X[0] *= 2;
if (test_type < 2) {
......@@ -268,7 +268,7 @@ void dslashTest() {
init();
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes(inv_param.cuda_prec, gaugeParam.blockDim) / (1 << 10);
float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
printf("Shared mem: %.3f KB\n", sharedKB);
......
......@@ -531,8 +531,18 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
exit(-1);
}
cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
cudaError_t error = cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
error = cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
......@@ -584,7 +594,7 @@ void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, ReconstructT
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision, ReconstructType reconstruct,
Tboundary t_boundary, int *XX, double anisotropy, int blockDim) {
Tboundary t_boundary, int *XX, double anisotropy) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
......@@ -604,8 +614,6 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision,
cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
cudaGauge->volume /= 2;
cudaGauge->blockDim = blockDim;
allocateGaugeField(cudaGauge, reconstruct, precision);
if (precision == QUDA_DOUBLE_PRECISION) {
......
......@@ -10,7 +10,7 @@ extern "C" {
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision,
ReconstructType reconstruct, Tboundary t_boundary,
int *X, double anisotropy, int blockDim);
int *X, double anisotropy);
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge);
......
......@@ -25,12 +25,12 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
QudaInvertParam *invert_param, DagType dag_type)
{
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor y = allocateParitySpinor(x.X, x.precision, x.pad);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor v = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor v = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor t = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor x_sloppy, r_sloppy, r0;
if (invert_param->cuda_prec_sloppy == x.precision) {
......@@ -38,9 +38,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
r_sloppy = r;
r0 = b;
} else {
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r0 = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
r0 = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
copyCuda(r0, b);
}
......@@ -50,6 +50,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
zeroCuda(y);
double b2 = normCuda(b);
double r2 = b2;
double stop = b2*invert_param->tol*invert_param->tol; // stopping condition of solver
double delta = invert_param->reliable_delta;
......@@ -108,7 +109,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
alpha.x *= -1.0; alpha.y *= -1.0;
caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
MatVec(t, cudaGaugeSloppy, cudaCloverSloppy, cudaCloverInvSloppy, r_sloppy, invert_param, tmp, dag_type);
// omega = (t, r) / (t, t)
......
......@@ -20,21 +20,21 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
}
}
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r, QudaInvertParam *invert_param)
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertParam *invert_param)
{
ParitySpinor y = allocateParitySpinor(x.X, x.precision);
ParitySpinor r = allocateParitySpinor(x.X, x.precision, x.pad);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
ParitySpinor p = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor Ap = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor tmp = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
ParitySpinor x_sloppy, r_sloppy;
if (invert_param->cuda_prec_sloppy == x.precision) {
x_sloppy = x;
r_sloppy = r;
} else {
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy);
x_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
r_sloppy = allocateParitySpinor(x.X, invert_param->cuda_prec_sloppy, x.pad);
}
copyCuda(r, b);
......@@ -151,7 +151,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r, QudaInvertPara
freeParitySpinor(Ap);
freeParitySpinor(tmp);
freeParitySpinor(y);
freeParitySpinor(r);
return;
}
......@@ -130,14 +130,13 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->cuda_prec, gauge_param->reconstruct,
gauge_param->t_boundary, gauge_param->X, gauge_param->anisotropy, gauge_param->blockDim);
gauge_param->t_boundary, gauge_param->X, gauge_param->anisotropy);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->cuda_prec_sloppy,
gauge_param->reconstruct_sloppy, gauge_param->t_boundary,
gauge_param->X, gauge_param->anisotropy,
gauge_param->blockDim_sloppy);
gauge_param->X, gauge_param->anisotropy);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
......@@ -235,8 +234,8 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
{
checkPrecision(inv_param->cpu_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -263,9 +262,9 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
{
checkPrecision(inv_param->cpu_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -293,9 +292,9 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
checkPrecision(inv_param->cpu_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -322,8 +321,8 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
checkPrecision(inv_param->cpu_prec);
FullSpinor in = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec);
FullSpinor in = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
FullSpinor out = allocateSpinorField(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -334,7 +333,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
MatCuda(out, cudaGaugePrecise, in, -kappa, dagger);
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, inv_param->cuda_prec, inv_param->sp_pad);
cloverMatCuda(out, cudaGaugePrecise, cudaCloverPrecise, in, kappa, tmp, dagger);
freeParitySpinor(tmp);
} else {
......@@ -368,18 +367,18 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
if (param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= cudaGaugePrecise.anisotropy;
FullSpinor b, x;
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec); // temporary used when applying operator
ParitySpinor in = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec, invert_param->sp_pad);// source
ParitySpinor out = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec, invert_param->sp_pad);// solution
ParitySpinor tmp = allocateParitySpinor(cudaGaugePrecise.X, invert_param->cuda_prec, invert_param->sp_pad);// temporary
if (param->solution_type == QUDA_MAT_SOLUTION) {
if (param->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b = allocateSpinorField(cudaGaugePrecise.X, invert_param->cuda_prec);
b = allocateSpinorField(cudaGaugePrecise.X, invert_param->cuda_prec, invert_param->sp_pad);
} else {
b.even = out;
b.odd = tmp;
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN ||
param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
x.odd = tmp;
......
......@@ -30,6 +30,8 @@ extern "C" {
int blockDim; // number of threads in a block
int blockDim_sloppy;
int ga_pad;
int packed_size;
double gaugeGiB;
......@@ -65,6 +67,9 @@ extern "C" {
QudaVerbosity verbosity;
int sp_pad;
int cl_pad;
int iter;
double spinorGiB;
double cloverGiB;
......
......@@ -18,7 +18,7 @@ int main(int argc, char **argv)
Gauge_param.X[0] = 24;
Gauge_param.X[1] = 24;
Gauge_param.X[2] = 24;
Gauge_param.X[3] = 128;
Gauge_param.X[3] = 64;
setDims(Gauge_param.X);
Gauge_param.anisotropy = 1.0;
......@@ -32,9 +32,6 @@ int main(int argc, char **argv)
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.blockDim = 64;
Gauge_param.blockDim_sloppy = 64;
gauge_param = &Gauge_param;
int clover_yes = 0; // 0 for plain Wilson, 1 for clover
......@@ -44,13 +41,13 @@ int main(int argc, char **argv)
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
inv_param.inv_type = QUDA_CG_INVERTER;
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
double mass = -0.94;
inv_param.kappa = 1.0 / (2.0*(1 + 3/gauge_param->anisotropy + mass));
inv_param.tol = 5e-7;
inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-3;
inv_param.reliable_delta = 1e-1;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.solution_type = QUDA_MAT_SOLUTION;
......@@ -59,9 +56,11 @@ int main(int argc, char **argv)
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.sp_pad = 0;
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
......
#define READ_SPINOR_DOUBLE(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*Vh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Vh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Vh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Vh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Vh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Vh); \
double2 I6 = fetch_double2((spinor), sp_idx + 6*Vh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Vh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Vh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Vh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Vh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Vh);
double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride)); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride)); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride)); \
double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride)); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
#define READ_SPINOR_DOUBLE_UP(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*Vh); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*Vh); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*Vh); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*Vh); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*Vh); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*Vh);
double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \
double2 I3 = fetch_double2((spinor), sp_idx + 3*(sp_stride)); \
double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride)); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_DOUBLE_DOWN(spinor) \
double2 I6 = fetch_double2((spinor), sp_idx + 6*Vh); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*Vh); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*Vh); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*Vh); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*Vh); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*Vh);
double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride)); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
#define READ_SPINOR_SINGLE(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh);
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_SINGLE_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
#define READ_SPINOR_SINGLE_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh);
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_HALF(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
......@@ -62,52 +62,52 @@
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_SPINOR_HALF_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Vh); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Vh); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Vh); \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
#define READ_SPINOR_HALF_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Vh); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Vh); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Vh); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_ACCUM_DOUBLE(spinor) \
double2 accum0 = fetch_double2((spinor), sid + 0*Vh); \
double2 accum1 = fetch_double2((spinor), sid + 1*Vh); \
double2 accum2 = fetch_double2((spinor), sid + 2*Vh); \
double2 accum3 = fetch_double2((spinor), sid + 3*Vh); \
double2 accum4 = fetch_double2((spinor), sid + 4*Vh); \
double2 accum5 = fetch_double2((spinor), sid + 5*Vh); \
double2 accum6 = fetch_double2((spinor), sid + 6*Vh); \
double2 accum7 = fetch_double2((spinor), sid + 7*Vh); \
double2 accum8 = fetch_double2((spinor), sid + 8*Vh); \
double2 accum9 = fetch_double2((spinor), sid + 9*Vh); \
double2 accum10 = fetch_double2((spinor), sid + 10*Vh); \
double2 accum11 = fetch_double2((spinor), sid + 11*Vh);
double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride)); \
double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride)); \
double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride)); \
double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride)); \
double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride)); \
double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride)); \
double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride)); \
double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \
double2 accum11 = fetch_double2((spinor), sid + 11*(sp_stride));
#define READ_ACCUM_SINGLE(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Vh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Vh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Vh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Vh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Vh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Vh);
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride));
#define READ_ACCUM_HALF(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Vh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Vh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Vh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Vh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Vh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Vh); \
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); \
float C = tex1Dfetch((accumTexNorm), sid); \
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
......@@ -118,26 +118,26 @@
#define WRITE_SPINOR_DOUBLE2() \
g_out[0*Vh+sid] = make_double2(o00_re, o00_im); \
g_out[1*Vh+sid] = make_double2(o01_re, o01_im); \
g_out[2*Vh+sid] = make_double2(o02_re, o02_im); \
g_out[3*Vh+sid] = make_double2(o10_re, o10_im); \
g_out[4*Vh+sid] = make_double2(o11_re, o11_im); \
g_out[5*Vh+sid] = make_double2(o12_re, o12_im); \
g_out[6*Vh+sid] = make_double2(o20_re, o20_im); \
g_out[7*Vh+sid] = make_double2(o21_re, o21_im); \
g_out[8*Vh+sid] = make_double2(o22_re, o22_im); \
g_out[9*Vh+sid] = make_double2(o30_re, o30_im); \
g_out[10*Vh+sid] = make_double2(o31_re, o31_im); \
g_out[11*Vh+sid] = make_double2(o32_re, o32_im);
g_out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im); \
g_out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im); \
g_out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im); \
g_out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im); \
g_out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im); \
g_out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im); \
g_out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im); \
g_out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im); \
g_out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im); \
g_out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im); \
g_out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im); \
g_out[11*(sp_stride)+sid] = make_double2(o32_re, o32_im);
#define WRITE_SPINOR_FLOAT4() \
g_out[0*Vh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*Vh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
g_out[2*Vh+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
g_out[3*Vh+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
g_out[4*Vh+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
g_out[5*Vh+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
g_out[0*(sp_stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*(sp_stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
g_out[2*(sp_stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
g_out[3*(sp_stride)+sid] = make_float4(o20_re, o20_im, o21_re, o21_im); \
g_out[4*(sp_stride)+sid] = make_float4(o22_re, o22_im, o30_re, o30_im); \
g_out[5*(sp_stride)+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
#define WRITE_SPINOR_SHORT4() \
float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im)); \
......@@ -171,12 +171,12 @@
o20_re *= scale; o20_im *= scale; o21_re *= scale; o21_im *= scale; \
o22_re *= scale; o22_im *= scale; o30_re *= scale; o30_im *= scale; \
o31_re *= scale; o31_im *= scale; o32_re *= scale; o32_im *= scale; \
g_out[sid+0*Vh] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
g_out[sid+1*Vh] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
g_out[sid+2*Vh] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
g_out[sid+3*Vh] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
g_out[sid+4*Vh] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
g_out[sid+5*Vh] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
g_out[sid+0*(sp_stride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im); \
g_out[sid+1*(sp_stride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im); \
g_out[sid+2*(sp_stride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im); \
g_out[sid+3*(sp_stride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im); \
g_out[sid+4*(sp_stride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im); \
g_out[sid+5*(sp_stride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
/*
#define WRITE_SPINOR_FLOAT1_SMEM() \
......
......@@ -15,9 +15,10 @@ QudaGaugeParam param;
FullSpinor cudaFullSpinor;
ParitySpinor cudaParitySpinor;
float *qdpGauge[4];
float *cpsGauge;
float *spinor;
void *qdpGauge[4];
void *cpsGauge;
void *spinor;
void *spinor2;
float kappa = 1.0;
int ODD_BIT = 0;
......@@ -38,20 +39,17 @@ aligned_malloc(size_t n, void **m0)
void init() {
Precision single = QUDA_SINGLE_PRECISION;
param.blockDim = 64;
param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_HALF_PRECISION;
param.reconstruct = QUDA_RECONSTRUCT_8;
param.cuda_prec_sloppy = param.cuda_prec;
param.reconstruct_sloppy = param.reconstruct;
param.X[0] = 24;
param.X[1] = 24;
param.X[2] = 24;
param.X[3] = 24;
param.X[0] = 4;
param.X[1] = 4;
param.X[2] = 4;
param.X[3] = 4;
param.ga_pad = 0;
setDims(param.X);
param.anisotropy = 2.3;
......@@ -59,20 +57,25 @@ void init() {
param.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &param;
int sp_pad = 256;
// construct input fields
for (int dir = 0; dir < 4; dir++) {
qdpGauge[dir] = (float*)malloc(V*gaugeSiteSize*sizeof(float));
qdpGauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec);
}
cpsGauge = (float*)malloc(4*V*gaugeSiteSize*sizeof(float));
cpsGauge = malloc(4*V*gaugeSiteSize*param.cpu_prec);
spinor = (float*)malloc(V*spinorSiteSize*sizeof(float));
spinor = malloc(V*spinorSiteSize*param.cpu_prec);
spinor2 = malloc(V*spinorSiteSize*param.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, param.cpu_prec);
int dev = 0;
cudaSetDevice(dev);
param.X[0] /= 2;
cudaFullSpinor = allocateSpinorField(param.X, single);
cudaParitySpinor = allocateParitySpinor(param.X, single);
cudaFullSpinor = allocateSpinorField(param.X, param.cuda_prec, sp_pad);
cudaParitySpinor = allocateParitySpinor(param.X, param.cuda_prec, sp_pad);
param.X[0] *= 2;
}
......@@ -82,6 +85,7 @@ void end() {
for (int dir = 0; dir < 4; dir++) free(qdpGauge[dir]);
free(cpsGauge);
free(spinor);
free(spinor2);
freeSpinorField(cudaFullSpinor);
freeParitySpinor(cudaParitySpinor);
freeSpinorBuffer();
......@@ -91,7 +95,7 @@ void packTest() {
init();
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(float) / (1 << 30);
float spinorGiB = (float)Vh*spinorSiteSize*param.cuda_prec / (1 << 30);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);
......@@ -100,7 +104,7 @@ void packTest() {
stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, cpsGauge, param.cuda_prec, param.reconstruct,
param.t_boundary, param.X, 1.0, param.blockDim);
param.t_boundary, param.X, 1.0);
double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime);
......@@ -111,8 +115,8 @@ void packTest() {
stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(&cudaGaugePrecise, qdpGauge, param.cpu_prec, param.reconstruct,
param.t_boundary, param.X, 1.0, param.blockDim);
createGaugeField(&cudaGaugePrecise, qdpGauge, param.cuda_prec, param.reconstruct,
param.t_boundary, param.X, 1.0);
double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......@@ -122,26 +126,26 @@ void packTest() {
printf("QDP Gauge restore time = %e seconds\n", qdpGRtime);
stopwatchStart();
loadSpinorField(cudaFullSpinor, (void*)spinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double sSendTime = stopwatchReadSeconds();
printf("Spinor send time = %e seconds\n", sSendTime);
loadParitySpinor(cudaFullSpinor.even, (void*)spinor, param.cpu_prec, QUDA_DIRAC_ORDER);
double pSendTime = stopwatchReadSeconds();
printf("Parity spinor send time = %e seconds\n", pSendTime);
stopwatchStart();
retrieveParitySpinor(spinor2, cudaFullSpinor.even, param.cpu_prec, QUDA_DIRAC_ORDER);
double pRecTime = stopwatchReadSeconds();
printf("Parity receive time = %e seconds\n", pRecTime);
stopwatchStart();
loadParitySpinor(cudaFullSpinor.even, (void*)spinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double pSendTime = stopwatchReadSeconds();
printf("Parity spinor send time = %e seconds\n", pSendTime);
loadSpinorField(cudaFullSpinor, (void*)spinor, param.cpu_prec, QUDA_DIRAC_ORDER);
double sSendTime = stopwatchReadSeconds();
printf("Spinor send time = %e seconds\n", sSendTime);
stopwatchStart();
retrieveSpinorField(spinor, cudaFullSpinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
retrieveSpinorField(spinor2, cudaFullSpinor, param.cpu_prec, QUDA_DIRAC_ORDER);
double sRecTime = stopwatchReadSeconds();
printf("Spinor receive time = %e seconds\n", sRecTime);
stopwatchStart();
retrieveParitySpinor(spinor, cudaParitySpinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double pRecTime = stopwatchReadSeconds();
printf("Parity receive time = %e seconds\n", pRecTime);
compare_spinor(spinor, spinor2, V, param.cpu_prec);
end();
......
......@@ -38,11 +38,13 @@ extern "C" {
typedef void *ParityGauge;
typedef struct {
int blockDim; // The size of the thread block to use
size_t bytes;
Precision precision;
int length; // total length
int real_length; // physical length (excluding padding)
int volume; // geometric volume (single parity)
int pad; // padding from end of array to start of next
int stride; // geometric stride between volume lengthed arrays
int X[4]; // the geometric lengths (single parity)
int Nc; // number of colors
ReconstructType reconstruct;
......@@ -55,7 +57,10 @@ extern "C" {
size_t bytes;
Precision precision;
int length;
int real_length; // physical length (excluding padding)
int volume;
int pad; // padding from end of array to start of next
int stride; // geometric stride between volume lengthed arrays
int X[4];
int Nc;
int Ns;
......@@ -72,7 +77,10 @@ extern "C" {
size_t bytes;
Precision precision;
int length; // total length
int real_length; // physical length (excluding padding)
int volume; // geometric volume (single parity)
int pad; // padding from end of array to start of next
int stride; // geometric stride between volume lengthed arrays
int X[4]; // the geometric lengths (single parity)
int Nc; // length of color dimension
int Ns; // length of spin dimension
......
......@@ -96,7 +96,7 @@ __global__ void REDUCE_FUNC_NAME(Kernel) (REDUCE_TYPES, QudaSumComplex *g_odata,
template <typename Float, typename Float2>
cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
if (n % REDUCE_THREADS != 0) {
printf("ERROR reduceCuda(): length must be a multiple of %d\n", REDUCE_THREADS);
printf("ERROR reduceCuda(): length %d must be a multiple of %d\n", n, REDUCE_THREADS);
exit(-1);
}
......@@ -107,11 +107,19 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// partial reduction; each block generates one number
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
#if (REDUCE_TYPE == REDUCE_KAHAN)
int smemSize = REDUCE_THREADS * 2 * sizeof(QudaSumComplex);
#else
int smemSize = REDUCE_THREADS * sizeof(QudaSumComplex);
#endif
REDUCE_FUNC_NAME(Kernel)<<< dimGrid, dimBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_reduceComplex, d_reduceComplex, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
cudaError_t error = cudaMemcpy(h_reduceComplex, d_reduceComplex, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cuDoubleComplex gpu_result;
gpu_result.x = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment