Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit ba59a842 authored by rbabich
Browse files

quda: eliminated global gauge_param


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@585 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 37c1415f
......@@ -4,6 +4,8 @@ Version 0.x
- Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use.
- Modified loadCloverQuda() to take gauge_param as an additional
parameter.
- Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi).
- Cleaned up the invert_test example to remove unnecessary dependencies.
......
......@@ -18,8 +18,6 @@ extern "C" {
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
extern QudaGaugeParam *gauge_param;
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
......
......@@ -8,13 +8,13 @@
extern "C" {
#endif
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision,
ReconstructType reconstruct, Tboundary t_boundary,
int *X, double anisotropy, int pad);
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec, Precision cpu_prec,
GaugeFieldOrder gauge_order, ReconstructType reconstruct, GaugeFixed gauge_fixed,
Tboundary t_boundary, int *XX, double anisotropy, int pad);
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge);
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, Precision cpu_prec, GaugeFieldOrder gauge_order);
void freeGaugeField(FullGauge *cudaCauge);
void freeGaugeField(FullGauge *cudaGauge);
#ifdef __cplusplus
}
......
......@@ -82,9 +82,9 @@ extern "C" {
void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void saveGaugeQuda(void *h_gauge);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param);
// void discardCloverQuda(QudaInvertParam *inv_param);
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param);
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger);
......
......@@ -49,6 +49,8 @@ extern "C" {
int X[4]; // the geometric lengths (single parity)
int Nc; // number of colors
ReconstructType reconstruct;
GaugeFixed gauge_fixed;
Tboundary t_boundary;
ParityGauge odd;
ParityGauge even;
double anisotropy;
......
......@@ -7,8 +7,6 @@
#include<dslash_textures.h>
#include<dslash_constants.h>
QudaGaugeParam *gauge_param;
unsigned long long dslash_quda_flops;
unsigned long long dslash_quda_bytes;
......@@ -106,18 +104,18 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
int X4X3X2X1hmX3X2X1h = (X4-1)*X3*X2*X1h;
cudaMemcpyToSymbol("X4X3X2X1hmX3X2X1h", &X4X3X2X1hmX3X2X1h, sizeof(int));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
int gf = (gauge.gauge_fixed == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(double));
cudaMemcpyToSymbol("anisotropy", &(gauge.anisotropy), sizeof(double));
double t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
double t_bc = (gauge.t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double));
float anisotropy_f = gauge_param->anisotropy;
float anisotropy_f = gauge.anisotropy;
cudaMemcpyToSymbol("anisotropy_f", &(anisotropy_f), sizeof(float));
float t_bc_f = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
float t_bc_f = (gauge.t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary_f", &(t_bc_f), sizeof(float));
float h_pi_f = M_PI;
......
......@@ -4,15 +4,13 @@
#include <quda_internal.h>
#include <gauge_quda.h>
//#include <xmmintrin.h>
#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
double Anisotropy;
QudaTboundary tBoundary;
int X[4];
static double Anisotropy;
static QudaTboundary tBoundary;
static int X[4];
template <typename Float>
inline short FloatToShort(const Float &a) {
......@@ -156,7 +154,7 @@ inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
// a = conj(b)*conj(c)
template <typename Float>
void complexConjugateProduct(Float *a, Float *b, Float *c) {
inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
a[0] = b[0]*c[0] - b[1]*c[1];
a[1] = -b[0]*c[1] - b[1]*c[0];
}
......@@ -354,7 +352,7 @@ inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) {
// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
......@@ -377,8 +375,8 @@ void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
Float *g = h_gauge[dir] + oddBit*V*18;
......@@ -399,7 +397,7 @@ void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
// transpose and scale the matrix
template <typename Float, typename Float2>
void transposeScale(Float *gT, Float *g, const Float2 &a) {
static void transposeScale(Float *gT, Float *g, const Float2 &a) {
for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r];
}
......@@ -407,8 +405,8 @@ void transposeScale(Float *gT, Float *g, const Float2 &a) {
// Assume the gauge field is "Wilson" ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
Float gT[18];
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
......@@ -441,8 +439,8 @@ void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
// Assume the gauge field is "Wilson" ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
ReconstructType reconstruct, int V, int pad) {
Float gT[18];
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
......@@ -472,7 +470,7 @@ void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
}
void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, Precision precision) {
static void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, Precision precision) {
cudaGauge->reconstruct = reconstruct;
cudaGauge->precision = precision;
......@@ -511,8 +509,8 @@ void freeGaugeField(FullGauge *cudaGauge) {
}
template <typename Float, typename FloatN>
void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType reconstruct,
int bytes, int Vh, int pad) {
static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, GaugeFieldOrder gauge_order,
ReconstructType reconstruct, int bytes, int Vh, int pad) {
// Use pinned memory
FloatN *packedEven, *packedOdd;
......@@ -525,14 +523,14 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
packedOdd = (FloatN*)malloc(bytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, Vh, pad);
packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct, Vh, pad);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
} else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh, pad);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
}
......@@ -560,8 +558,8 @@ void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType
}
template <typename Float, typename FloatN>
void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, ReconstructType reconstruct,
int bytes, int Vh, int pad) {
static void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, GaugeFieldOrder gauge_order,
ReconstructType reconstruct, int bytes, int Vh, int pad) {
// Use pinned memory
FloatN *packedEven, *packedOdd;
......@@ -577,14 +575,14 @@ void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, ReconstructT
cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(packedOdd, odd, bytes, cudaMemcpyDeviceToHost);
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
} else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
}
......@@ -598,10 +596,11 @@ void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, ReconstructT
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision, ReconstructType reconstruct,
Tboundary t_boundary, int *XX, double anisotropy, int pad) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec, Precision cpu_prec,
GaugeFieldOrder gauge_order, ReconstructType reconstruct, GaugeFixed gauge_fixed,
Tboundary t_boundary, int *XX, double anisotropy, int pad)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
......@@ -620,72 +619,74 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision precision,
cudaGauge->volume /= 2;
cudaGauge->pad = pad;
cudaGauge->stride = cudaGauge->volume + cudaGauge->pad;
cudaGauge->gauge_fixed = gauge_fixed;
cudaGauge->t_boundary = t_boundary;
allocateGaugeField(cudaGauge, reconstruct, precision);
allocateGaugeField(cudaGauge, reconstruct, cuda_prec);
if (precision == QUDA_DOUBLE_PRECISION) {
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
} else if (precision == QUDA_SINGLE_PRECISION) {
} else if (cuda_prec == QUDA_SINGLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
} else if (precision == QUDA_HALF_PRECISION) {
} else if (cuda_prec == QUDA_HALF_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
}
}
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, Precision cpu_prec, GaugeFieldOrder gauge_order)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
retrieveGaugeField((double*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
retrieveGaugeField((float*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
} else if (cudaGauge->precision == QUDA_SINGLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
retrieveGaugeField((double*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
retrieveGaugeField((float*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
} else if (cudaGauge->precision == QUDA_HALF_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
if (cpu_prec == QUDA_DOUBLE_PRECISION)
retrieveGaugeField((double*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
else if (cpu_prec == QUDA_SINGLE_PRECISION)
retrieveGaugeField((float*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd),
cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
}
}
......@@ -5,7 +5,6 @@
#include <quda_internal.h>
#include <spinor_quda.h>
#include <gauge_quda.h>
#include <util_quda.h>
......
......@@ -5,7 +5,6 @@
#include <quda_internal.h>
#include <util_quda.h>
#include <spinor_quda.h>
#include <gauge_quda.h>
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp) {
......
......@@ -92,21 +92,19 @@ void initQuda(int dev)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
gauge_param = param;
checkGaugeParam(gauge_param);
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->cuda_prec, gauge_param->reconstruct,
gauge_param->t_boundary, gauge_param->X, gauge_param->anisotropy, gauge_param->ga_pad);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->cuda_prec_sloppy,
gauge_param->reconstruct_sloppy, gauge_param->t_boundary,
gauge_param->X, gauge_param->anisotropy, gauge_param->ga_pad);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
checkGaugeParam(param);
param->packed_size = (param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, param->cuda_prec, param->cpu_prec, param->gauge_order, param->reconstruct, param->gauge_fix,
param->t_boundary, param->X, param->anisotropy, param->ga_pad);
param->gaugeGiB = 2.0*cudaGaugePrecise.bytes/ (1 << 30);
if (param->cuda_prec_sloppy != param->cuda_prec ||
param->reconstruct_sloppy != param->reconstruct) {
createGaugeField(&cudaGaugeSloppy, h_gauge, param->cuda_prec_sloppy, param->cpu_prec, param->gauge_order,
param->reconstruct_sloppy, param->gauge_fix, param->t_boundary,
param->X, param->anisotropy, param->ga_pad);
param->gaugeGiB += 2.0*cudaGaugeSloppy.bytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
......@@ -114,16 +112,15 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
/*
Very limited functionality here
- currently assumes that the precision of the cpu field is the same as before
- no ability to dump the sloppy gauge field
- really exposes how crap the current api is
*/
void saveGaugeQuda(void *h_gauge)
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
restoreGaugeField(h_gauge, &cudaGaugePrecise);
restoreGaugeField(h_gauge, &cudaGaugePrecise, param->cpu_prec, param->gauge_order);
}
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
{
if (!h_clover && !h_clovinv) {
printf("QUDA error: loadCloverQuda() called with neither clover term nor inverse\n");
......@@ -220,7 +217,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
parity = (parity+1)%2;
axCuda(gauge_param->anisotropy, in);
axCuda(cudaGaugePrecise.anisotropy, in);
}
if (inv_param->dslash_type == QUDA_WILSON_DSLASH) {
......
......@@ -14,7 +14,7 @@ int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover)
int clover_yes = 0;
QudaGaugeParam gaugeParam;
QudaGaugeParam gauge_param;
QudaInvertParam inv_param;
FullGauge gauge;
......@@ -35,26 +35,26 @@ int TRANSFER = 0; // include transfer time in the benchmark?
void init() {
gaugeParam = newQudaGaugeParam();
gauge_param = newQudaGaugeParam();
inv_param = newQudaInvertParam();
gaugeParam.X[0] = 24;
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24;
gaugeParam.X[3] = 48;
setDims(gaugeParam.X);
gauge_param.X[0] = 24;
gauge_param.X[1] = 24;
gauge_param.X[2] = 24;
gauge_param.X[3] = 48;
setDims(gauge_param.X);
gaugeParam.anisotropy = 2.3;
gauge_param.anisotropy = 2.3;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
if (clover_yes) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
......@@ -69,11 +69,11 @@ void init() {
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
gaugeParam.ga_pad = 0;
gauge_param.ga_pad = 0;
inv_param.sp_pad = 0;
inv_param.cl_pad = 0;
/*gaugeParam.ga_pad = 24*24*12;
/*gauge_param.ga_pad = 24*24*12;
inv_param.sp_pad = 24*24*12;
inv_param.cl_pad = 24*24*12;*/
......@@ -88,10 +88,8 @@ void init() {
}
inv_param.verbosity = QUDA_VERBOSE;
gauge_param = &gaugeParam;
// construct input fields
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gaugeParam.cpu_prec);
for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);
if (clover_yes) {
if (test_type > 0) {
......@@ -121,7 +119,7 @@ void init() {
printf("Randomizing fields... ");
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec, &gaugeParam);
construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
if (clover_yes) {
......@@ -139,11 +137,11 @@ void init() {
int dev = 0;
initQuda(dev);
loadGaugeQuda(hostGauge, &gaugeParam);
loadGaugeQuda(hostGauge, &gauge_param);
gauge = cudaGaugePrecise;
if (clover_yes) {
loadCloverQuda(NULL, hostCloverInv, &inv_param);
loadCloverQuda(NULL, hostCloverInv, &gauge_param, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
......@@ -152,11 +150,11 @@ void init() {
if (!TRANSFER) {
gaugeParam.X[0] /= 2;
tmp = allocateParitySpinor(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinor = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinorOut = allocateSpinorField(gaugeParam.X, inv_param.cuda_prec, inv_param.sp_pad);
gaugeParam.X[0] *= 2;
gauge_param.X[0] /= 2;
tmp = allocateParitySpinor(gauge_param.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinor = allocateSpinorField(gauge_param.X, inv_param.cuda_prec, inv_param.sp_pad);
cudaSpinorOut = allocateSpinorField(gauge_param.X, inv_param.cuda_prec, inv_param.sp_pad);
gauge_param.X[0] *= 2;
if (test_type < 2) {
loadParitySpinor(cudaSpinor.even, spinorEven, inv_param.cpu_prec,
......@@ -254,15 +252,15 @@ void dslashRef() {
switch (test_type) {
case 0:
dslash(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
inv_param.cpu_prec, gauge_param.cpu_prec);
break;
case 1:
matpc(spinorRef, hostGauge, spinorEven, kappa, inv_param.matpc_type, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
inv_param.cpu_prec, gauge_param.cpu_prec);
break;
case 2:
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
inv_param.cpu_prec, gauge_param.cpu_prec);
break;
default:
printf("Test type not defined\n");
......@@ -280,7 +278,7 @@ void dslashTest() {
float spinorGiB = (float)Vh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float sharedKB = 0;//(float)dslashCudaSharedBytes(inv_param.cuda_prec) / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
printf("Shared mem: %.3f KB\n", sharedKB);
int attempts = 1;
......@@ -300,7 +298,7 @@ void dslashTest() {
printf("%fms per loop\n", 1000*secs);
int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
int floats = test_type ? 2*(7*24+8*gauge_param.packed_size+24)+24 : 7*24+8*gauge_param.packed_size+24;
if (clover_yes) {
flops += test_type ? 504*2 : 504;
floats += test_type ? 72*2 : 72;
......