Advanced Computing Platform for Theoretical Physics

Commit a2f374da authored by mikeaclark's avatar mikeaclark
Browse files

Nearly implemented double precision GPU...

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@376 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 1226333b
......@@ -23,7 +23,7 @@
extern "C" {
#endif
// ---------- blas_cuda.cu ----------
// ---------- blas_quda.cu ----------
void zeroCuda(float* dst, int cnt);
void copyCuda(float* dst, float *src, int len);
......@@ -56,26 +56,6 @@ double3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2
cuDoubleComplex xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int len);
// ---------- blas_reference.cpp ----------
void zero(float* a, int cnt);
void copy(float* a, float *b, int len);
void ax(float a, float *x, int len);
void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpy(float a, float *x, float *y, int len);
void xpay(float *x, float a, float *y, int len);
void mxpy(float *x, float *y, int len);
float norm(float *vector, int len);
float reDotProduct(float *v1, float *v2, int len);
float imDotProduct(float *v1, float *v2, int len);
double normD(float *vector, int len);
double reDotProductD(float *v1, float *v2, int len);
double imDotProductD(float *v1, float *v2, int len);
#ifdef __cplusplus
}
#endif
......
#include <quda.h>
#include <blas_reference.h>
// performs the operation x[i] *= a
template <typename Float>
void aX(Float a, Float *x, int len) {
for (int i=0; i<len; i++) x[i] *= a;
}
void ax(double a, void *x, int len, Precision precision) {
if (precision == QUDA_DOUBLE_PRECISION) aX(a, (double*)x, len);
else aX((float)a, (float*)x, len);
}
// performs the operation y[i] -= x[i] (minus x plus y)
template <typename Float>
void mXpY(Float *x, Float *y, int len) {
for (int i=0; i<len; i++) y[i] -= x[i];
}
void mxpy(void* x, void* y, int len, Precision precision) {
if (precision == QUDA_DOUBLE_PRECISION) mXpY((double*)x, (double*)y, len);
else mXpY((float*)x, (float*)y, len);
}
// returns the square of the L2 norm of the vector
template <typename Float>
double norm2(Float *v, int len) {
double sum=0.0;
for (int i=0; i<len; i++) sum += v[i]*v[i];
return sum;
}
double norm_2(void *v, int len, Precision precision) {
if (precision == QUDA_DOUBLE_PRECISION) return norm2((double*)v, len);
else return norm2((float*)v, len);
}
/*
// sets all elements of the destination vector to zero
......@@ -12,11 +52,6 @@ void copy(float* a, float *b, int len) {
for (int i = 0; i < len; i++) a[i] = b[i];
}
// performs the operation x[i] *= a
void ax(float a, float *x, int len) {
for (int i=0; i<len; i++) x[i] *= a;
}
// performs the operation y[i] = a*x[i] + b*y[i]
void axpby(float a, float *x, float b, float *y, int len) {
for (int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
......@@ -27,27 +62,6 @@ void axpy(float a, float *x, float *y, int len) {
for (int i=0; i<len; i++) y[i] += a*x[i];
}
// performs the operation y[i] = x[i] + a*y[i]
void xpay(float *x, float a, float *y, int len) {
for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];
}
// performs the operation y[i] -= x[i] (minus x plus y)
void mxpy(float *x, float *y, int len) {
for (int i=0; i<len; i++) y[i] -= x[i];
}
// returns the square of the L2 norm of the vector
float norm(float *v, int len) {
float sum=0.0;
for (int i=0; i<len; i++) {
sum += v[i]*v[i];
}
return sum;
}
// returns the real part of the dot product of 2 complex valued vectors
float reDotProduct(float *v1, float *v2, int len) {
......@@ -103,3 +117,4 @@ double imDotProductD(float *v1, float *v2, int len) {
return dot;
}
*/
#ifndef _QUDA_BLAS_REF_H
#define _QUDA_BLAS_REF_H
#include <quda.h>
#ifdef __cplusplus
extern "C" {
#endif
// ---------- blas_reference.cpp ----------
double norm_2(void *vector, int len, Precision precision);
void mxpy(void *x, void *y, int len, Precision precision);
void ax(double a, void *x, int len, Precision precision);
/* void zero(float* a, int cnt);
void copy(float* a, float *b, int len);*/
/*void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpy(float a, float *x, float *y, int len);*/
//void xpay(float *x, float a, float *y, int len);
/*float reDotProduct(float *v1, float *v2, int len);
float imDotProduct(float *v1, float *v2, int len);
double normD(float *vector, int len);
double reDotProductD(float *v1, float *v2, int len);
double imDotProductD(float *v1, float *v2, int len);*/
#ifdef __cplusplus
}
#endif
#endif // _QUDA_BLAS_REF_H
......@@ -15,8 +15,6 @@
#define BLOCK_DIM (64) // threads per block
#define GRID_DIM (Nh/BLOCK_DIM) // there are Nh threads in total
#define SPINOR_BYTES (Nh*spinorSiteSize*sizeof(float))
#define PACKED12_GAUGE_BYTES (4*Nh*12*sizeof(float))
#define PACKED8_GAUGE_BYTES (4*Nh*8*sizeof(float))
......@@ -26,9 +24,9 @@
extern "C" {
#endif
extern FullGauge cudaDGauge;
extern FullGauge cudaSGauge;
extern FullGauge cudaHGauge;
extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy;
extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param;
......@@ -80,19 +78,6 @@ extern "C" {
QudaSumComplex MatPCDagcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);*/
// -- dslash_reference.cpp
void dslashReference(float *res, float **gauge, float *spinorField,
int oddBit, int daggerBit);
void Mat(float *out, float **gauge, float *in, float kappa);
void MatDag(float *out, float **gauge, float *in, float kappa);
void MatDagMat(float *out, float **gauge, float *in, float kappa);
void MatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
void MatPCDag(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *param);
......
This diff is collapsed.
#include <blas_reference.h>
#ifndef _QUDA_DSLASH_REF_H
#define _QUDA_DSLASH_REF_H
#ifdef __cplusplus
extern "C" {
#endif
// -- dslash_reference.cpp
void dslash_reference(void *res, void **gauge, void *spinorField,
int oddBit, int daggerBit, Precision sPrecision, Precision gPrecision);
void mat(void *out, void **gauge, void *in, double kappa, int daggerBit, Precision sPrecision, Precision gPrecision);
void matpc(void *out, void **gauge, void *in, double kappa, MatPCType matpc_type,
int daggerBit, Precision sPrecision, Precision gPrecision);
#ifdef __cplusplus
}
#endif
#endif // _QUDA_DLASH_REF_H
......@@ -2,13 +2,13 @@
#include <stdlib.h>
#include <quda.h>
#include <dslash_reference.h>
#include <util_quda.h>
#include <spinor_quda.h>
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
int test_type = 0;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -18,85 +18,75 @@ FullSpinor cudaSpinor;
FullSpinor cudaSpinorOut;
ParitySpinor tmp;
float *hostGauge[4];
float *spinor, *spinorRef, *spinorGPU;
float *spinorEven, *spinorOdd;
void *hostGauge[4];
void *spinor, *spinorRef, *spinorGPU;
void *spinorEven, *spinorOdd;
float kappa = 1.0;
double kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
void printSpinorHalfField(float *spinor) {
printSpinor(&spinor[0*spinorSiteSize]);
printf("...\n");
printSpinor(&spinor[(Nh-1)*spinorSiteSize]);
printf("\n");
}
void printSpinorFullField(float *spinor) {
printSpinor(&spinor[0*spinorSiteSize]);
printf("...\n");
printSpinor(&spinor[(N-1)*spinorSiteSize]);
printf("\n");
}
void init() {
gaugeParam.cpu_prec = QUDA_SINGLE_PRECISION;
gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
gaugeParam.reconstruct_precise = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_precise = QUDA_SINGLE_PRECISION;
gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_8;
gaugeParam.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
gaugeParam.T = L4;
gaugeParam.anisotropy = 2.3;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_8;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
invert_param = &inv_param;
size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
// construct input fields
for (int dir = 0; dir < 4; dir++) {
hostGauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
}
for (int dir = 0; dir < 4; dir++)
hostGauge[dir] = malloc(N*gaugeSiteSize*gSize);
spinor = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinorRef = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinorGPU = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinor = malloc(N*spinorSiteSize*sSize);
spinorRef = malloc(N*spinorSiteSize*sSize);
spinorGPU = malloc(N*spinorSiteSize*sSize);
spinorEven = spinor;
spinorOdd = spinor + Nh*spinorSiteSize;
printf("Randomizing fields...");
constructGaugeField(hostGauge);
constructSpinorField(spinor);
construct_gauge_field(hostGauge, 1, gaugeParam.cpu_prec);
construct_spinor_field(spinor, 1, 0, 0, 0, inv_param.cpu_prec);
printf("done.\n"); fflush(stdout);
int dev = 0;
initQuda(dev);
loadGaugeQuda((void*)hostGauge, &gaugeParam);
loadGaugeQuda(hostGauge, &gaugeParam);
if (gaugeParam.cuda_prec == QUDA_SINGLE_PRECISION) gauge = cudaSGauge;
else gauge = cudaHGauge;
gauge = cudaGaugePrecise;
printf("Sending fields to GPU..."); fflush(stdout);
if (!TRANSFER) {
Precision prec = QUDA_SINGLE_PRECISION;
cudaSpinor = allocateSpinorField(prec);
cudaSpinorOut = allocateSpinorField(prec);
tmp = allocateParitySpinor(prec);
cudaSpinor = allocateSpinorField(inv_param.cuda_prec);
cudaSpinorOut = allocateSpinorField(inv_param.cuda_prec);
tmp = allocateParitySpinor(inv_param.cuda_prec);
loadSpinorField(cudaSpinor, (void*)spinor, inv_param.cpu_prec,
loadSpinorField(cudaSpinor, spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
printf("\nnorm = %e\n", normCuda(cudaSpinor.even.spinor, Nh*24));
}
}
......@@ -121,13 +111,16 @@ void dslashRef() {
fflush(stdout);
switch (test_type) {
case 0:
dslashReference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT);
dslash_reference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 1:
MatPC(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN);
matpc(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
case 2:
Mat(spinorRef, hostGauge, spinor, kappa);
mat(spinorRef, hostGauge, spinor, kappa, DAGGER_BIT,
inv_param.cpu_prec, gaugeParam.cpu_prec);
break;
default:
printf("Test type not defined\n");
......@@ -174,63 +167,25 @@ double dslashCUDA() {
}
void strongCheck() {
int len;
void *spinorRes;
if (test_type < 2) {
printf("Reference:\n");
printSpinorHalfField(spinorRef);
printf("\nCUDA:\n");
printSpinorHalfField(spinorOdd);
len = Nh;
spinorRes = spinorOdd;
} else {
printf("Reference:\n");
printSpinorHalfField(spinorRef);
printf("\nCUDA:\n");
printSpinorHalfField(spinorGPU);
len = N;
spinorRes = spinorGPU;
}
int fail_check = 12;
int fail[fail_check];
for (int f=0; f<fail_check; f++) fail[f] = 0;
int iter[24];
for (int i=0; i<24; i++) iter[i] = 0;
if (test_type < 2) {
for (int i=0; i<Nh; i++) {
for (int j=0; j<24; j++) {
int is = i*24+j;
float diff = fabs(spinorRef[is]-spinorOdd[is]);
for (int f=0; f<fail_check; f++)
if (diff > pow(10,-(f+1))) fail[f]++;
//if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
if (diff > 1e-3) iter[j]++;
}
}
for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
for (int f=0; f<fail_check; f++) {
printf("%e Failures: %d / %d = %e\n", pow(10,-(f+1)), fail[f], Nh*24, fail[f] / (float)(Nh*24));
}
} else {
for (int i=0; i<N; i++) {
for (int j=0; j<24; j++) {
int is = i*24+j;
float diff = fabs(spinorRef[is]-spinorGPU[is]);
for (int f=0; f<fail_check; f++)
if (diff > pow(10,-(f+1))) fail[f]++;
//if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
if (diff > 1e-3) iter[j]++;
}
}
for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
printf("Reference:\n");
printSpinorElement(spinorRef, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRef, len-1, inv_param.cpu_prec); printf("\n");
for (int f=0; f<fail_check; f++) {
printf("%e Failures: %d / %d = %e\n", pow(10,-(f+1)), fail[f], N*24, fail[f] / (float)(N*24));
}
}
printf("\nCUDA:\n");
printSpinorElement(spinorRes, 0, inv_param.cpu_prec); printf("...\n");
printSpinorElement(spinorRes, len-1, inv_param.cpu_prec); printf("\n");
//compare_spinor(spinorRef, spinorRes, len, inv_param.cpu_prec);
}
......@@ -238,7 +193,7 @@ void dslashTest() {
init();
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(float) / (1 << 30);
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
......@@ -251,10 +206,10 @@ void dslashTest() {
double secs = dslashCUDA();
if (!TRANSFER) {
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
if (test_type < 2) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
else retrieveSpinorField(spinorGPU, cudaSpinorOut, inv_param.cpu_prec,
inv_param.cuda_prec, QUDA_DIRAC_ORDER);
}
// print timing information
printf("%fms per loop\n", 1000*secs);
......@@ -264,8 +219,8 @@ void dslashTest() {
printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
int res;
if (test_type < 2) res = compareFloats(spinorOdd, spinorRef, Nh*4*3*2, 1e-4);
else res = compareFloats(spinorGPU, spinorRef, N*4*3*2, 1e-4);
if (test_type < 2) res = compare_floats(spinorOdd, spinorRef, Nh*4*3*2, 1e-4, inv_param.cpu_prec);
else res = compare_floats(spinorGPU, spinorRef, N*4*3*2, 1e-4, inv_param.cpu_prec);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
strongCheck();
......@@ -280,5 +235,3 @@ void dslashTest() {
int main(int argc, char **argv) {
dslashTest();
}
......@@ -10,11 +10,6 @@
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
// GPU gauge field
FullGauge cudaDGauge;
FullGauge cudaSGauge;
FullGauge cudaHGauge;
inline short floatToShort(float a) {
return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
}
......@@ -378,10 +373,10 @@ void packCPSGaugeFieldHS(short4 *res, float *gauge, int oddBit, ReconstructType
}
void allocateGaugeField(FullGauge *gauge, ReconstructType reconstruct, Precision precision) {
void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, Precision precision) {
gauge->reconstruct = reconstruct;
gauge->precision = precision;
cudaGauge->reconstruct = reconstruct;
cudaGauge->precision = precision;
int floatSize;
if (precision == QUDA_DOUBLE_PRECISION) floatSize = sizeof(double);
......@@ -389,17 +384,17 @@ void allocateGaugeField(FullGauge *gauge, ReconstructType reconstruct, Precision
else floatSize = sizeof(float)/2;
int elements = (reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
gauge->packedGaugeBytes = 4*Nh*elements*floatSize;
cudaGauge->packedGaugeBytes = 4*Nh*elements*floatSize;
if (!gauge->even) {
if (cudaMalloc((void **)&gauge->even, gauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
if (!cudaGauge->even) {
if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even gauge field\n");
exit(0);
}
}
if (!gauge->odd) {
if (cudaMalloc((void **)&gauge->odd, gauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
if (!cudaGauge->odd) {
if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->packedGaugeBytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even odd gauge field\n");
exit(0);
}
......@@ -407,21 +402,11 @@ void allocateGaugeField(FullGauge *gauge, ReconstructType reconstruct, Precision
}
void freeGaugeField() {
if (cudaDGauge.even) cudaFree(cudaDGauge.even);
if (cudaDGauge.odd) cudaFree(cudaDGauge.odd);
cudaDGauge.even = NULL;
cudaDGauge.odd = NULL;
if (cudaSGauge.even) cudaFree(cudaSGauge.even);
if (cudaSGauge.odd) cudaFree(cudaSGauge.odd);
cudaSGauge.even = NULL;
cudaSGauge.odd = NULL;
if (cudaHGauge.even) cudaFree(cudaHGauge.even);
if (cudaHGauge.odd) cudaFree(cudaHGauge.odd);
cudaHGauge.even = NULL;
cudaHGauge.odd = NULL;
void freeGaugeField(FullGauge *cudaGauge) {
if (cudaGauge->even) cudaFree(cudaGauge->even);
if (cudaGauge->odd) cudaFree(cudaGauge->odd);
cudaGauge->even = NULL;
cudaGauge->odd = NULL;
}
void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
......@@ -444,11 +429,11 @@ void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
packQDPGaugeFieldDD(packedEven, (double**)cpuGauge, 0, gauge_param->reconstruct);
packQDPGaugeFieldDD(packedOdd, (double**)cpuGauge, 1, gauge_param->reconstruct);
packQDPGaugeFieldDD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldDD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeFieldDD(packedEven, (double*)cpuGauge, 0, gauge_param->reconstruct);
packCPSGaugeFieldDD(packedOdd, (double*)cpuGauge, 1, gauge_param->reconstruct);
packCPSGaugeFieldDD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldDD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
......@@ -478,19 +463,19 @@ void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPGaugeFieldSD(packedEven, (double**)cpuGauge, 0, gauge_param->reconstruct);
packQDPGaugeFieldSD(packedOdd, (double**)cpuGauge, 1, gauge_param->reconstruct);
packQDPGaugeFieldSD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldSD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packQDPGaugeFieldSS(packedEven, (float**)cpuGauge, 0, gauge_param->reconstruct);
packQDPGaugeFieldSS(packedOdd, (float**)cpuGauge, 1, gauge_param->reconstruct);
packQDPGaugeFieldSS(packedEven, (float**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldSS(packedOdd, (float**)cpuGauge, 1, cudaGauge->reconstruct);
}
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSGaugeFieldSD(packedEven, (double*)cpuGauge, 0, gauge_param->reconstruct);
packCPSGaugeFieldSD(packedOdd, (double*)cpuGauge, 1, gauge_param->reconstruct);
packCPSGaugeFieldSD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldSD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packCPSGaugeFieldSS(packedEven, (float*)cpuGauge, 0, gauge_param->reconstruct);
packCPSGaugeFieldSS(packedOdd, (float*)cpuGauge, 1, gauge_param->reconstruct);
packCPSGaugeFieldSS(packedEven, (float*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldSS(packedOdd, (float*)cpuGauge, 1, cudaGauge->reconstruct);
}
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
......@@ -520,19 +505,19 @@ void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPGaugeFieldHD(packedEven, (double**)cpuGauge, 0, gauge_param->reconstruct);
packQDPGaugeFieldHD(packedOdd, (double**)cpuGauge, 1, gauge_param->reconstruct);
packQDPGaugeFieldHD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldHD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packQDPGaugeFieldHS(packedEven, (float**)cpuGauge, 0, gauge_param->reconstruct);
packQDPGaugeFieldHS(packedOdd, (float**)cpuGauge, 1, gauge_param->reconstruct);
packQDPGaugeFieldHS(packedEven, (float**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldHS(packedOdd, (float**)cpuGauge, 1, cudaGauge->reconstruct);
}
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSGaugeFieldHD(packedEven, (double*)cpuGauge, 0, gauge_param->reconstruct);
packCPSGaugeFieldHD(packedOdd, (double*)cpuGauge, 1, gauge_param->reconstruct);
packCPSGaugeFieldHD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldHD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packCPSGaugeFieldHS(packedEven, (float*)cpuGauge, 0, gauge_param->reconstruct);
packCPSGaugeFieldHS(packedOdd, (float*)cpuGauge, 1, gauge_param->reconstruct);
packCPSGaugeFieldHS(packedEven, (float*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldHS(packedOdd, (float*)cpuGauge, 1, cudaGauge->reconstruct);
}
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
......@@ -554,34 +539,19 @@ void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
}
void createGaugeField(void *cpuGauge) {
setCudaGaugeParam();
if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
printf("QUDA error: dimensions do not match: %d=%d, %d=%d, %d=%d, %d=%d\n",
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1);
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, Precision precision) {
if (gauge_param->cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
}
if (gauge_param->reconstruct == QUDA_RECONSTRUCT_NO) {
if (reconstruct == QUDA_RECONSTRUCT_NO) {
printf("QUDA error: ReconstructType not yet supported\n");
exit(-1);
}
gauge_param->packed_size = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
allocateGaugeField(&cudaSGauge, gauge_param->reconstruct, QUDA_SINGLE_PRECISION);
loadGaugeField(&cudaSGauge, cpuGauge);
allocateGaugeField(&cudaHGauge, gauge_param->reconstruct, QUDA_HALF_PRECISION);
loadGaugeField(&cudaHGauge, cpuGauge);
// 2 since even-odd
gauge_param->gaugeGiB = (float)2*(cudaSGauge.packedGaugeBytes+cudaHGauge.packedGaugeBytes)/ (1 << 30);
allocateGaugeField(cudaGauge, reconstruct, precision);
loadGaugeField(cudaGauge, cpuGauge);
}
......@@ -8,10 +8,8 @@
extern "C" {
#endif
void allocateGaugeField(FullGauge *gauge, ReconstructType reconstruct, Precision precision);
void freeGaugeField();
void createGaugeField(void *gauge);
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, Precision precision);
void freeGaugeField(FullGauge *cudaCauge);
#ifdef __cplusplus
}
......
......@@ -81,8 +81,6 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
//rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
printf("%e %e\n", normCuda((float*) v.spinor, len), normCuda((float*) p.spinor, len));
rv = cDotProductCuda((float2*)source.spinor, (float2*)v.spinor, len/2);
cuComplex rv32 = make_cuFloatComplex((float)rv.x, (float)rv.y);
alpha = cuCdivf(rho, rv32);
......
......@@ -9,8 +9,8 @@
#include <spinor_quda.h>
#include <gauge_quda.h>
FullGauge gaugeSloppy; // sloppy gauge field
FullGauge gaugePrecise; // precise gauge field
FullGauge cudaGaugePrecise; // precise gauge field
FullGauge cudaGaugeSloppy; // sloppy gauge field
void printGaugeParam(QudaGaugeParam *param) {
......@@ -22,8 +22,10 @@ void printGaugeParam(QudaGaugeParam *param) {
printf("anisotropy = %e\n", param->anisotropy);
printf("gauge_order = %d\n", param->gauge_order);
printf("cpu_prec = %d\n", param->cpu_prec);
printf("cuda_prec = %d\n", param->cuda_prec);
printf("reconstruct = %d\n", param->reconstruct);
printf("cuda_prec_precise = %d\n", param->cuda_prec_precise);
printf("reconstruct_precise = %d\n", param->reconstruct_precise);
printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy);
printf("reconstruct_sloppy = %d\n", param->reconstruct_sloppy);
printf("gauge_fix = %d\n", param->gauge_fix);
printf("t_boundary = %d\n", param->t_boundary);
printf("packed_size = %d\n", param->packed_size);
......@@ -78,11 +80,11 @@ void initQuda(int dev)
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
cudaSetDevice(dev);
cudaSGauge.even = NULL;
cudaSGauge.odd = NULL;
cudaGaugePrecise.even = NULL;
cudaGaugePrecise.odd = NULL;
cudaHGauge.even = NULL;
cudaHGauge.odd = NULL;
cudaGaugeSloppy.even = NULL;
cudaGaugeSloppy.odd = NULL;
hSpinor1.spinor = NULL;
hSpinor1.spinorNorm = NULL;
......@@ -94,36 +96,42 @@ void initQuda(int dev)
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
gauge_param = param;
createGaugeField(h_gauge);
gaugePrecise = cudaSGauge;
gaugeSloppy = (param->cuda_prec == QUDA_HALF_PRECISION) ? cudaHGauge : cudaSGauge;
setCudaGaugeParam();
if (gauge_param->X != L1 || gauge_param->Y != L2 || gauge_param->Z != L3 || gauge_param->T != L4) {
printf("QUDA error: dimensions do not match: %d=%d, %d=%d, %d=%d, %d=%d\n",
gauge_param->X, L1, gauge_param->Y, L2, gauge_param->Z, L3, gauge_param->T, L4);
exit(-1);
}
gauge_param->packed_size = (gauge_param->reconstruct_precise == QUDA_RECONSTRUCT_8) ? 8 : 12;
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct_precise, gauge_param->cuda_prec_precise);
gauge_param->gaugeGiB = (float)2*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB = (float)2*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
}
void endQuda()
{
freeSpinorBuffer();
freeGaugeField();
freeGaugeField(&cudaGaugePrecise);
freeGaugeField(&cudaGaugeSloppy);
}
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger)
{
Precision single = QUDA_SINGLE_PRECISION;
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor in = allocateParitySpinor(single);
ParitySpinor out = allocateParitySpinor(single);
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashCuda(out, cudaSGauge, in, parity, dagger);
} else if (inv_param->cuda_prec == QUDA_HALF_PRECISION) {
dslashCuda(out, cudaHGauge, in, parity, dagger);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
printf("\nnorm = %e\n", normCuda((float*)(in.spinor), Nh*24));
dslashCuda(out, cudaGaugePrecise, in, parity, dagger);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(out);
freeParitySpinor(in);
......@@ -131,23 +139,13 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
Precision single = QUDA_SINGLE_PRECISION;
ParitySpinor in = allocateParitySpinor(single);
ParitySpinor out = allocateParitySpinor(single);
ParitySpinor tmp = allocateParitySpinor(single);
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCCuda(out, cudaSGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
......@@ -156,23 +154,13 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
Precision single = QUDA_SINGLE_PRECISION;
ParitySpinor in = allocateParitySpinor(single);
ParitySpinor out = allocateParitySpinor(single);
ParitySpinor tmp = allocateParitySpinor(single);
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCDagCuda(out, cudaSGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCDagCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCDagCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
......@@ -181,23 +169,13 @@ void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
Precision single = QUDA_SINGLE_PRECISION;
ParitySpinor in = allocateParitySpinor(single);
ParitySpinor out = allocateParitySpinor(single);
ParitySpinor tmp = allocateParitySpinor(single);
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
ParitySpinor in = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(inv_param->cuda_prec);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCDagMatPCCuda(out, cudaSGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCDagMatPCCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
MatPCDagMatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
......@@ -206,23 +184,17 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in, out;
Precision single = QUDA_SINGLE_PRECISION;
in.even = allocateParitySpinor(single);
in.odd = allocateParitySpinor(single);
out.even = allocateParitySpinor(single);
out.odd = allocateParitySpinor(single);
in.even = allocateParitySpinor(inv_param->cuda_prec);
in.odd = allocateParitySpinor(inv_param->cuda_prec);
out.even = allocateParitySpinor(inv_param->cuda_prec);
out.odd = allocateParitySpinor(inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->cuda_prec, inv_param->dirac_order);
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 0);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 0);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashCuda(out.odd, cudaSGauge, in.even, 1, 0);
dslashCuda(out.even, cudaSGauge, in.odd, 0, 0);
} else if (inv_param->cuda_prec == QUDA_HALF_PRECISION) {
dslashCuda(out.odd, cudaHGauge, in.even, 1, 0);
dslashCuda(out.even, cudaHGauge, in.odd, 0, 0);
}
xpayCuda((float*)in.even.spinor, -inv_param->kappa, (float*)out.even.spinor, Nh*spinorSiteSize);
xpayCuda((float*)in.odd.spinor, -inv_param->kappa, (float*)out.odd.spinor, Nh*spinorSiteSize);
......@@ -237,23 +209,18 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in, out;
Precision single = QUDA_SINGLE_PRECISION;
in.even = allocateParitySpinor(single);
in.odd = allocateParitySpinor(single);
out.even = allocateParitySpinor(single);
out.odd = allocateParitySpinor(single);
in.even = allocateParitySpinor(inv_param->cuda_prec);
in.odd = allocateParitySpinor(inv_param->cuda_prec);
out.even = allocateParitySpinor(inv_param->cuda_prec);
out.odd = allocateParitySpinor(inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashCuda(out.odd, cudaSGauge, in.even, 1, 1);
dslashCuda(out.even, cudaSGauge, in.odd, 0, 1);
} else if (inv_param->cuda_prec == QUDA_HALF_PRECISION) {
dslashCuda(out.odd, cudaHGauge, in.even, 1, 1);
dslashCuda(out.even, cudaHGauge, in.odd, 0, 1);
}
dslashCuda(out.odd, cudaGaugePrecise, in.even, 1, 1);
dslashCuda(out.even, cudaGaugePrecise, in.odd, 0, 1);
xpayCuda((float*)in.even.spinor, -inv_param->kappa, (float*)out.even.spinor, Nh*spinorSiteSize);
xpayCuda((float*)in.odd.spinor, -inv_param->kappa, (float*)out.odd.spinor, Nh*spinorSiteSize);
......@@ -268,8 +235,6 @@ void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
{
Precision single = QUDA_SINGLE_PRECISION;
invert_param = param;
if (param->cuda_prec == QUDA_DOUBLE_PRECISION) {
......@@ -298,14 +263,14 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
float kappa = param->kappa;
FullSpinor b, x;
ParitySpinor in = allocateParitySpinor(single); // source vector
ParitySpinor out = allocateParitySpinor(single); // solution vector
ParitySpinor tmp = allocateParitySpinor(single); // temporary used when applying operator
ParitySpinor in = allocateParitySpinor(invert_param->cuda_prec); // source vector
ParitySpinor out = allocateParitySpinor(invert_param->cuda_prec); // solution vector
ParitySpinor tmp = allocateParitySpinor(invert_param->cuda_prec); // temporary used when applying operator
if (param->solution_type == QUDA_MAT_SOLUTION) {
if (param->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
b.even = allocateParitySpinor(single);
b.odd = allocateParitySpinor(single);
b.even = allocateParitySpinor(invert_param->cuda_prec);
b.odd = allocateParitySpinor(invert_param->cuda_prec);
} else {
b.even = out;
b.odd = tmp;
......@@ -323,9 +288,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpaySCuda(in, gaugePrecise, b.odd, 0, 0, b.even, kappa);
dslashXpaySCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
} else {
dslashXpaySCuda(in, gaugePrecise, b.even, 1, 0, b.odd, kappa);
dslashXpaySCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
}
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
......@@ -344,16 +309,16 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
case QUDA_CG_INVERTER:
if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
copyCuda((float *)out.spinor, (float *)in.spinor, slenh);
MatPCDagCuda(in, gaugePrecise, out, kappa, tmp, param->matpc_type);
MatPCDagCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type);
}
invertCgCuda(out, in, gaugeSloppy, tmp, param);
invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
break;
case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, gaugeSloppy, gaugePrecise, tmp, param, QUDA_DAG_YES);
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_YES);
copyCuda((float *)in.spinor, (float *)out.spinor, slenh);
}
invertBiCGstabCuda(out, in, gaugeSloppy, gaugePrecise, tmp, param, QUDA_DAG_NO);
invertBiCGstabCuda(out, in, cudaGaugeSloppy, cudaGaugePrecise, tmp, param, QUDA_DAG_NO);
break;
default:
printf("Inverter type %d not implemented\n", param->inv_type);
......@@ -369,9 +334,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpaySCuda(x.odd, gaugePrecise, out, 1, 0, b.odd, kappa);
dslashXpaySCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
} else {
dslashXpaySCuda(x.even, gaugePrecise, out, 0, 0, b.even, kappa);
dslashXpaySCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
}
retrieveSpinorField(h_x, x, param->cpu_prec, param->cuda_prec, param->dirac_order);
......
......@@ -19,9 +19,13 @@ extern "C" {
QudaGaugeFieldOrder gauge_order;
QudaPrecision cpu_prec;
QudaPrecision cuda_prec;
QudaReconstructType reconstruct;
QudaPrecision cuda_prec_precise;
QudaReconstructType reconstruct_precise;
QudaPrecision cuda_prec_sloppy;
QudaReconstructType reconstruct_sloppy;
QudaGaugeFixed gauge_fix;
QudaTboundary t_boundary;
......
......@@ -4,18 +4,25 @@
#include <quda.h>
#include <util_quda.h>
#include <dslash_reference.h>
int main(int argc, char **argv)
{
int device = 0;
float *gauge[4];
void *gauge[4];
QudaGaugeParam Gauge_param;
QudaInvertParam inv_param;
Gauge_param.cpu_prec = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec = QUDA_HALF_PRECISION;
Gauge_param.cuda_prec_precise = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_precise = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.X = L1;
Gauge_param.Y = L2;
......@@ -23,18 +30,17 @@ int main(int argc, char **argv)
Gauge_param.T = L4;
Gauge_param.anisotropy = 1.0;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_8;
inv_param.inv_type = QUDA_CG_INVERTER;
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
float mass = -0.95;
float mass = -0.958;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 5e-7;
inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-8;
inv_param.reliable_delta = 1e-6;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
......@@ -44,10 +50,9 @@ int main(int argc, char **argv)
inv_param.dirac_order = QUDA_DIRAC_ORDER;
for (int dir = 0; dir < 4; dir++) {
gauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
gauge[dir] = malloc(N*gaugeSiteSize*sizeof(float));
}
constructGaugeField(gauge);
//constructUnitGaugeField(gauge);
construct_gauge_field(gauge, 1, QUDA_SINGLE_PRECISION);
float *spinorIn = (float*)malloc(N*spinorSiteSize*sizeof(float));
float *spinorOut = (float*)malloc(N*spinorSiteSize*sizeof(float));
......@@ -60,8 +65,7 @@ int main(int argc, char **argv)
int i0 = 0;
int s0 = 0;
int c0 = 0;
// constructPointSpinorField(spinorIn, i0, s0, c0);
constructPointSpinorField(spinorIn, i0, s0, c0);
construct_spinor_field(spinorIn, 0, i0, s0, c0, QUDA_SINGLE_PRECISION);
double time0 = -((double)clock()); // Start the timer
......@@ -78,13 +82,13 @@ int main(int argc, char **argv)
printf("done: %i iter / %g secs = %g gflops, total time = %g secs\n",
inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);
Mat(spinorCheck, gauge, spinorOut, inv_param.kappa);
mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, Gauge_param.cpu_prec);
if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION)
ax(0.5/inv_param.kappa, spinorCheck, N*spinorSiteSize);
ax(0.5/inv_param.kappa, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
mxpy(spinorIn, spinorCheck, N*spinorSiteSize);
float nrm2 = norm(spinorCheck, N*spinorSiteSize);
float src2 = norm(spinorIn, N*spinorSiteSize);
mxpy(spinorIn, spinorCheck, N*spinorSiteSize, inv_param.cpu_prec);
float nrm2 = norm_2(spinorCheck, N*spinorSiteSize, QUDA_SINGLE_PRECISION);
float src2 = norm_2(spinorIn, N*spinorSiteSize, QUDA_SINGLE_PRECISION);
printf("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
endQuda();
......
......@@ -35,10 +35,10 @@ aligned_malloc(size_t n, void **m0)
}
void printSpinorHalfField(float *spinor) {
printSpinor(&spinor[0*spinorSiteSize]);
void printSpinorHalfField(void *spinor, Precision precision) {
printSpinorElement(spinor, 0, precision);
printf("...\n");
printSpinor(&spinor[(Nh-1)*spinorSiteSize]);
printSpinorElement(spinor, Nh-1, precision);
printf("\n");
}
......@@ -46,17 +46,16 @@ void init() {
Precision single = QUDA_SINGLE_PRECISION;
cudaSGauge.even = NULL;
cudaSGauge.odd = NULL;
param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec_precise = QUDA_SINGLE_PRECISION;
param.reconstruct_precise = QUDA_RECONSTRUCT_12;
param.cuda_prec_sloppy = param.cuda_prec_precise;
param.reconstruct_sloppy = param.reconstruct_precise;
param.X = L1;
param.Y = L2;
param.Z = L3;
param.T = L4;
param.anisotropy = 2.3;
param.reconstruct = QUDA_RECONSTRUCT_12;
param.t_boundary = QUDA_ANTI_PERIODIC_T;
param.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &param;
......@@ -100,13 +99,13 @@ void packTest() {
stopwatchStart();
param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
createGaugeField(cpsGauge);
createGaugeField(&cudaGaugePrecise, cpsGauge, param.reconstruct_precise, param.cuda_prec_precise);
double cpsGtime = stopwatchReadSeconds();
printf("CPS Gauge send time = %e seconds\n", cpsGtime);
stopwatchStart();
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
createGaugeField(qdpGauge);
createGaugeField(&cudaGaugePrecise, qdpGauge, param.reconstruct_precise, param.cuda_prec_precise);
double qdpGtime = stopwatchReadSeconds();
printf("QDP Gauge send time = %e seconds\n", qdpGtime);
......
......@@ -3,10 +3,10 @@
#include <cuda_runtime.h>
#define L1 24 // "x" dimension
#define L2 24 // "y" dimension
#define L3 24 // "z" dimension
#define L4 32 // "time" dimension
#define L1 8 // "x" dimension
#define L2 8 // "y" dimension
#define L3 8 // "z" dimension
#define L4 8 // "time" dimension
#define L1h (L1/2) // half of the full "x" dimension, useful for even/odd lattice indexing
#define N (L1*L2*L3*L4) // total number of lattice points
......@@ -53,7 +53,7 @@ extern "C" {
typedef struct {
Precision precision;
void *spinor; // either (float4 *) or (short4 *), depending on precision
void *spinor; // either (double2*), (float4 *) or (short4 *), depending on precision
float *spinorNorm; // used only when precision is QUDA_HALF_PRECISION
} ParitySpinor;
......
......@@ -10,8 +10,8 @@
FullClover cudaClover;
// Pinned memory for cpu-gpu memory copying
float4 *packedSpinor1 = 0;
float4 *packedSpinor2 = 0;
void *packedSpinor1 = 0;
void *packedSpinor2 = 0;
// Half precision spinor field temporaries
ParitySpinor hSpinor1, hSpinor2;
......@@ -120,6 +120,14 @@ void freeSpinorHalf() {
freeParitySpinor(hSpinor2);
}
inline void packDouble2(double2* a, double *b) {
a->x = b[0]; a->y = b[1];
}
inline void unpackDouble2(double *a, double2 *b) {
a[0] = b->x; a[1] = b->y;
}
inline void packFloat4(float4* a, float *b) {
__m128 SSEtmp;
SSEtmp = _mm_loadu_ps((const float*)b);
......@@ -134,7 +142,50 @@ inline void unpackFloat4(float *a, float4 *b) {
//a[0] = b->x; a[1] = b->y; a[2] = b->z; a[3] = b->w;
}
void packDoubleFullSpinor(float4 *even, float4 *odd, double *spinor) {
void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
double K = 1.0 / 2.0;
double b[24];
for (int i=0; i<Nh; i++) {
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
{ // even sites
int k = 2*i + boundaryCrossings%2;
double *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*4);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
double *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j = 0; j < 12; j++) packDouble2(odd+j*Nh+i, b+j*4);
}
}
}
void packFullSpinorSD(float4 *even, float4 *odd, double *spinor) {
double K = 1.0 / 2.0;
float b[24];
......@@ -177,12 +228,10 @@ void packDoubleFullSpinor(float4 *even, float4 *odd, double *spinor) {
}
void packSingleFullSpinor(float4 *even, float4 *odd, float *spinor) {
void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
float K = 1.0 / 2.0;
float b[24];
//printf("\n");
for (int i=0; i<Nh; i++) {
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
......@@ -221,11 +270,32 @@ void packSingleFullSpinor(float4 *even, float4 *odd, float *spinor) {
for (int j = 0; j < 6; j++) packFloat4(odd+j*Nh+i, b+j*4);
}
}
//exit(0);
}
// Standard spinor packing, colour inside spin
void packDoubleParitySpinor(float4 *res, double *spinor) {
void packParitySpinorDD(double2 *res, double *spinor) {
double K = 1.0 / (2.0);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j = 0; j < 12; j++) packDouble2(res+j*Nh+i, b+j*4);
}
}
void packParitySpinorSD(float4 *res, double *spinor) {
double K = 1.0 / (2.0);
float b[24];
......@@ -247,7 +317,7 @@ void packDoubleParitySpinor(float4 *res, double *spinor) {
}
// single precision version of the above
void packSingleParitySpinor(float4 *res, float *spinor) {
void packParitySpinorSS(float4 *res, float *spinor) {
float K = 1.0 / (2.0);
float b[24];
......@@ -269,10 +339,10 @@ void packSingleParitySpinor(float4 *res, float *spinor) {
}
// QDP spinor packing, spin inside colour
void packQDPDoubleParitySpinor(float4 *res, double *spinor) {
void packQDPParitySpinorDD(double2 *res, double *spinor) {
double K = 1.0 / 2.0;
float b[24];
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
......@@ -287,14 +357,35 @@ void packQDPDoubleParitySpinor(float4 *res, double *spinor) {
}
}
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
for (int j = 0; j < 6; j++) packDouble2(res+j*Nh+i, b+j*4);
}
}
// QDP spinor packing, spin inside colour
void packQDPParitySpinorSD(float4 *res, double *spinor) {
double K = 1.0 / 2.0;
float b[24];
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
// reorder and change basis
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
}
}
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
}
}
// Single precision version of the above
void packQDPSingleParitySpinor(float4 *res, float *spinor) {
void packQDPParitySpinorSS(float4 *res, float *spinor) {
float K = 1.0 / 2.0;
float b[24];
......@@ -316,7 +407,54 @@ void packQDPSingleParitySpinor(float4 *res, float *spinor) {
}
}
void unpackDoubleFullSpinor(double *res, float4 *even, float4 *odd) {
void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
double K = 1.0;
double b[24];
for (int i=0; i<Nh; i++) {
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
{ // even sites
int k = 2*i + boundaryCrossings%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, even+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, odd+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
}
void unpackFullSpinorDS(double *res, float4 *even, float4 *odd) {
double K = 1.0;
float b[24];
......@@ -363,7 +501,7 @@ void unpackDoubleFullSpinor(double *res, float4 *even, float4 *odd) {
}
void unpackSingleFullSpinor(float *res, float4 *even, float4 *odd) {
void unpackFullSpinorSS(float *res, float4 *even, float4 *odd) {
float K = 1.0;
float b[24];
......@@ -410,7 +548,30 @@ void unpackSingleFullSpinor(float *res, float4 *even, float4 *odd) {
}
void unpackDoubleParitySpinor(double *res, float4 *spinorPacked) {
void unpackParitySpinorDD(double *res, double2 *spinorPacked) {
double K = 1.0;///sqrt(2.0);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
void unpackParitySpinorDS(double *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
......@@ -433,7 +594,7 @@ void unpackDoubleParitySpinor(double *res, float4 *spinorPacked) {
}
void unpackSingleParitySpinor(float *res, float4 *spinorPacked) {
void unpackParitySpinorSS(float *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
......@@ -456,7 +617,29 @@ void unpackSingleParitySpinor(float *res, float4 *spinorPacked) {
}
void unpackQDPDoubleParitySpinor(double *res, float4 *spinorPacked) {
void unpackQDPParitySpinorDD(double *res, double2 *spinorPacked) {
double K = 1.0;///sqrt(2.0);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
void unpackQDPParitySpinorDS(double *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
......@@ -478,7 +661,7 @@ void unpackQDPDoubleParitySpinor(double *res, float4 *spinorPacked) {
}
void unpackQDPSingleParitySpinor(float *res, float4 *spinorPacked) {
void unpackQDPParitySpinorSS(float *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
......@@ -503,10 +686,6 @@ void unpackQDPSingleParitySpinor(float *res, float4 *spinorPacked) {
void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
Precision cuda_prec, DiracFieldOrder dirac_order) {
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
printf("Sorry, double precision not supported\n");
exit(-1);
}
if (cuda_prec == QUDA_HALF_PRECISION) {
if (!hSpinor1.spinor && !hSpinor1.spinorNorm &&
......@@ -521,41 +700,61 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
}
}
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
#ifndef __DEVICE_EMULATION__
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, SPINOR_BYTES);
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
#else
if (!packedSpinor1) packedSpinor1 = (float4*)malloc(SPINOR_BYTES);
if (!packedSpinor1) packedSpinor1 = malloc(spinor_bytes);
#endif
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packDoubleParitySpinor(packedSpinor1, (double*)spinor);
else packSingleParitySpinor(packedSpinor1, (float*)spinor);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
packParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPDoubleParitySpinor(packedSpinor1, (double*)spinor);
else packQDPSingleParitySpinor(packedSpinor1, (float*)spinor);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
packQDPParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packQDPParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
}
}
cudaMemcpy(ret.spinor, packedSpinor1, SPINOR_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(ret.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
}
void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec, Precision cuda_prec) {
if (cuda_prec == QUDA_DOUBLE_PRECISION || cuda_prec == QUDA_HALF_PRECISION) {
printf("Sorry, only single precision is supported\n");
if (cuda_prec == QUDA_HALF_PRECISION) {
printf("Sorry, half precision isn't supported\n");
exit(-1);
}
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
#ifndef __DEVICE_EMULATION__
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, SPINOR_BYTES);
if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, SPINOR_BYTES);
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
if (!packedSpinor2) cudaMallocHost(&packedSpinor2, spinor_bytes);
#else
if (!packedSpinor1) packedSpinor1 = (float4*)malloc(SPINOR_BYTES);
if (!packedSpinor2) packedSpinor2 = (float4*)malloc(SPINOR_BYTES);
if (!packedSpinor1) packedSpinor1 = malloc(spinor_bytes);
if (!packedSpinor2) packedSpinor2 = malloc(spinor_bytes);
#endif
if (cpu_prec == QUDA_DOUBLE_PRECISION) packDoubleFullSpinor(packedSpinor1, packedSpinor2, (double*)spinor);
else packSingleFullSpinor(packedSpinor1, packedSpinor2, (float*)spinor);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
packFullSpinorDD((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packFullSpinorSD((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else packFullSpinorSS((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
}
cudaMemcpy(ret.even.spinor, packedSpinor1, SPINOR_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(ret.odd.spinor, packedSpinor2, SPINOR_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(ret.even.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(ret.odd.spinor, packedSpinor2, spinor_bytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedSpinor2);
......@@ -588,30 +787,47 @@ void loadSpinorField(FullSpinor ret, void *spinor, Precision cpu_prec,
void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order) {
/*if (cuda_prec != QUDA_SINGLE_PRECISION) {
printf("Sorry, only single precision supported\n");
exit(-1);
}*/
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, SPINOR_BYTES);
cudaMemcpy(packedSpinor1, spinor.spinor, SPINOR_BYTES, cudaMemcpyDeviceToHost);
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor_bytes);
cudaMemcpy(packedSpinor1, spinor.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackDoubleParitySpinor((double*)res, packedSpinor1);
else unpackSingleParitySpinor((float*)res, packedSpinor1);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
unpackParitySpinorDD((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackParitySpinorSS((float*)res, (float4*)packedSpinor1);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPDoubleParitySpinor((double*)res, packedSpinor1);
else unpackQDPSingleParitySpinor((float*)res, packedSpinor1);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
unpackQDPParitySpinorDD((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackQDPParitySpinorSS((float*)res, (float4*)packedSpinor1);
}
}
}
void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec, Precision cuda_prec) {
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, SPINOR_BYTES);
if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, SPINOR_BYTES);
cudaMemcpy(packedSpinor1, spinor.even.spinor, SPINOR_BYTES, cudaMemcpyDeviceToHost);
cudaMemcpy(packedSpinor2, spinor.odd.spinor, SPINOR_BYTES, cudaMemcpyDeviceToHost);
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackDoubleFullSpinor((double*)res, packedSpinor1, packedSpinor2);
else unpackSingleFullSpinor((float*)res, packedSpinor1, packedSpinor2);
size_t spinor_bytes;
if (cuda_prec == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor_bytes);
if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, spinor_bytes);
cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (cuda_prec == QUDA_DOUBLE_PRECISION) {
unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackFullSpinorDS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorSS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
}
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedSpinor2);
......@@ -642,6 +858,7 @@ void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec,
}
/*
void spinorHalfPack(float *c, short *s0, float *f0) {
float *f = f0;
......@@ -670,3 +887,4 @@ void spinorHalfUnpack(float *f0, float *c, short *s0) {
}
}
*/
......@@ -34,61 +34,51 @@ void SU3Test() {
param.t_boundary = QUDA_ANTI_PERIODIC_T;
printf("Randomizing fields...");
constructGaugeField(gauge);
//constructUnitGaugeField(gauge);
construct_gauge_field(gauge, 1, QUDA_SINGLE_PRECISION);
printf("done.\n");
int fail_check = 12;
int fail8[fail_check], fail12[fail_check], fail8h[fail_check];
int fail8[fail_check], fail12[fail_check];
for (int f=0; f<fail_check; f++) {
fail8[f] = 0;
fail12[f] = 0;
fail8h[f] = 0;
}
int iter8[18], iter12[18], iter8h[18];
int iter8[18], iter12[18];
for (int i=0; i<18; i++) {
iter8[i] = 0;
iter12[i] = 0;
iter8h[i] = 0;
}
for (int eo=0; eo<2; eo++) {
for (int i=0; i<Nh; i++) {
int ga_idx = (eo*Nh+i);
for (int d=0; d<4; d++) {
float gauge8[18], gauge12[18], gauge8half[18];
short gauge8short[18];
float gauge8[18], gauge12[18];
for (int j=0; j<18; j++) {
gauge8[j] = gauge[d][ga_idx*18+j];
gauge12[j] = gauge[d][ga_idx*18+j];
gauge8half[j] = gauge[d][ga_idx*18+j];
}
su3_construct_8(gauge8);
su3_reconstruct_8(gauge8, d, i);
su3_construct(gauge8, QUDA_RECONSTRUCT_8, QUDA_SINGLE_PRECISION);
su3_reconstruct(gauge8, d, i, QUDA_RECONSTRUCT_8, QUDA_SINGLE_PRECISION);
su3_construct_12(gauge12);
su3_reconstruct_12(gauge12, d, i);
su3_construct(gauge12, QUDA_RECONSTRUCT_12, QUDA_SINGLE_PRECISION);
su3_reconstruct(gauge12, d, i, QUDA_RECONSTRUCT_12, QUDA_SINGLE_PRECISION);
su3_construct_8_half(gauge8half, gauge8short);
su3_reconstruct_8_half(gauge8half, gauge8short, d, i);
if (fabs(gauge8[8] - gauge[d][ga_idx*18+8]) > 1e-1) {
printGauge(gauge[d]+ga_idx*18);printf("\n");
printGauge(gauge8);printf("\n");
printGauge(gauge12);
printGaugeElement(gauge[d]+ga_idx*18, 0, QUDA_SINGLE_PRECISION);printf("\n");
printGaugeElement(gauge8, 0, QUDA_SINGLE_PRECISION);printf("\n");
printGaugeElement(gauge12, 0, QUDA_SINGLE_PRECISION);
exit(0);
}
for (int j=0; j<18; j++) {
float diff8 = fabs(gauge8[j] - gauge[d][ga_idx*18+j]);
float diff12 = fabs(gauge12[j] - gauge[d][ga_idx*18+j]);
float diff8h = fabs(gauge8half[j] - gauge[d][ga_idx*18+j]);
for (int f=0; f<fail_check; f++) {
if (diff8 > pow(10,-(f+1))) fail8[f]++;
if (diff12 > pow(10,-(f+1))) fail12[f]++;
if (diff8h > pow(10,-(f+1))) fail8h[f]++;
}
if (diff8 > 1e-3) {
iter8[j]++;
......@@ -99,22 +89,17 @@ void SU3Test() {
iter12[j]++;
}
if (diff8h > 1e-3) {
iter8h[j]++;
}
}
}
}
}
for (int i=0; i<18; i++) printf("%d 12 fails = %d, 8 fails = %d, 8h fails = %d\n",
i, iter12[i], iter8[i], iter8h[i]);
for (int i=0; i<18; i++) printf("%d 12 fails = %d, 8 fails = %d\n", i, iter12[i], iter8[i]);
for (int f=0; f<fail_check; f++) {
printf("%e Failures: 12 component = %d / %d = %e, 8 component = %d / %d = %e, 8h component = %d / %d = %e\n",
printf("%e Failures: 12 component = %d / %d = %e, 8 component = %d / %d = %e\n",
pow(10,-(f+1)), fail12[f], N*4*18, fail12[f] / (float)(4*N*18),
fail8[f], N*4*18, fail8[f] / (float)(4*N*18),
fail8h[f], N*4*18, fail8h[f] / (float)(4*N*18));
fail8[f], N*4*18, fail8[f] / (float)(4*N*18));
}
// release memory
......
This diff is collapsed.
......@@ -9,37 +9,29 @@ extern "C" {
// ---------- qcd.cpp ----------
int compareFloats(float *a, float *b, int len, float tol);
int compare_floats(void *a, void *b, int len, double tol, Precision precision);
void stopwatchStart();
double stopwatchReadSeconds();
void printSpinor(float *spinor);
void printSpinorElement(float *spinor, int X);
void printGauge(float *gauge);
void printGaugeElement(float *gauge, int X);
void printSpinorElement(void *spinor, int X, Precision precision);
void printGaugeElement(void *gauge, int X, Precision precision);
int fullLatticeIndex(int i, int oddBit);
int getOddBit(int X);
void applyGaugeFieldScaling(float **gauge);
void constructUnitGaugeField(float **gauge);
void constructGaugeField(float **gauge);
void constructPointSpinorField(float *spinor, int i0, int s0, int c0);
void constructSpinorField(float *res);
void construct_gauge_field(void **gauge, int type, Precision precision);
void construct_spinor_field(void *spinor, int type, int i0, int s0, int c0, Precision precision);
void applyGamma5(float *out, float *in, int len);
void su3_construct_12(float *mat);
void su3_reconstruct_12(float *mat, int dir, int ga_idx);
void su3_construct_8(float *mat);
void su3_reconstruct_8(float *mat, int dir, int ga_idx);
void su3_construct_8_half(float *mat, short *mat_half);
void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx);
void su3_construct(void *mat, ReconstructType reconstruct, Precision precision);
void su3_reconstruct(void *mat, int dir, int ga_idx, ReconstructType reconstruct, Precision precision);
//void su3_construct_8_half(float *mat, short *mat_half);
//void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx);
void su3_construct_8_bunk(float *mat, int dir);
void su3_reconstruct_8_bunk(float *mat, int dir, int ga_idx);
void apply_gamma5(void *out, void *in, int len, Precision precision);
void compare_spinor(void *spinor_cpu, void *spinor_gpu, int len, Precision precision);
// ---------- gauge_read.cpp ----------
void readGaugeField(char *filename, float *gauge[], int argc, char *argv[]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment