Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 4a39a396 authored by mikeaclark's avatar mikeaclark
Browse files

Working dslash double

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@388 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 8cdc25ad
......@@ -7,6 +7,21 @@
#define REDUCE_THREADS 128
#define REDUCE_MAX_BLOCKS 64
#define REDUCE_DOUBLE 64
#define REDUCE_KAHAN 32
#if (__CUDA_ARCH__ == 130)
#define REDUCE_TYPE REDUCE_DOUBLE
#define QudaSumFloat double
#define QudaSumComplex cuDoubleComplex
#define QudaSumFloat3 double3
#else
#define REDUCE_TYPE REDUCE_KAHAN
#define QudaSumFloat float
#define QudaSumComplex cuComplex
#define QudaSumFloat3 float3
#endif
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision == QUDA_HALF_PRECISION || b.precision == QUDA_HALF_PRECISION) {
printf("checkSpinor error, this kernel does not support QUDA_HALF_PRECISION\n");
......@@ -34,7 +49,7 @@ void zeroCuda(Float* dst, int len) {
void zeroQuda(ParitySpinor a) {
if (a.precision == QUDA_DOUBLE_PRECISION) {
zeroCuda((double*)a.spinor, a.length);
} else if (a.precision == QUDA_HALF_PRECISION) {
} else if (a.precision == QUDA_SINGLE_PRECISION) {
zeroCuda((float*)a.spinor, a.length);
} else {
zeroCuda((short*)a.spinor, a.length);
......@@ -65,7 +80,6 @@ __global__ void axpbyKernel(Float a, Float *x, Float b, Float *y, int len) {
}
// performs the operation y[i] = a*x[i] + b*y[i]
template <typename Float>
void axpbyQuda(double a, ParitySpinor x, double b, ParitySpinor y) {
checkSpinor(x, y);
int blocks = min(REDUCE_MAX_BLOCKS, max(x.length/REDUCE_THREADS, 1));
......
......@@ -4,21 +4,6 @@
#ifndef _QUDA_BLAS_H
#define _QUDA_BLAS_H
#define REDUCE_DOUBLE 64
#define REDUCE_KAHAN 32
#if (__CUDA_ARCH__ == 130)
#define REDUCE_TYPE REDUCE_DOUBLE
#define QudaSumFloat double
#define QudaSumComplex cuDoubleComplex
#define QudaSumFloat3 double3
#else
#define REDUCE_TYPE REDUCE_KAHAN
#define QudaSumFloat float
#define QudaSumComplex cuComplex
#define QudaSumFloat3 float3
#endif
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -408,8 +408,35 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
}
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
if (invert_param->cuda_prec == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out, gauge, in, parity, dagger, x, a);
} else if (invert_param->cuda_prec == QUDA_HALF_PRECISION) {
printf("Not yet implemented\n");
exit(-1);
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
int spinor_float_bytes = Nh*spinorSiteSize*sizeof(float);
cudaBindTexture(0, spinorTexSingle, in.spinor, spinor_float_bytes);
spinorHalfPack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(hSpinor1.spinorNorm, hSpinor1.spinor);
dslashXpayHCuda(hSpinor2, gauge, hSpinor1, parity, dagger, x, a);
int spinor_half_bytes = Nh*spinorSiteSize*sizeof(float)/2;
cudaBindTexture(0, spinorTexHalf, hSpinor2.spinor, spinor_half_bytes);
cudaBindTexture(0, spinorTexNorm, hSpinor2.spinorNorm, spinor_half_bytes/12);
spinorHalfUnpack <<<gridDim, blockDim, SHARED_BYTES_SINGLE>>>(out);
}
}
void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, double a) {
int oddBit, int daggerBit, ParitySpinor x, double a) {
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
......
......@@ -5,9 +5,6 @@
#include <quda.h>
#define packed12GaugeSiteSize 12 // real numbers per link, using SU(3) reconstruction
#define packed8GaugeSiteSize 8 // real numbers per link, using SU(3) reconstruction
#define gaugeSiteSize 18 // real numbers per link
#define spinorSiteSize 24 // real numbers per spinor
#define cloverSiteSize 72 // real numbers per block-diagonal clover matrix
......@@ -61,6 +58,8 @@ extern "C" {
// wrapper to above
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a);
// Full Wilson matrix
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
......
......@@ -483,7 +483,7 @@ void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(cudaGauge->even, packedEven, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge->odd, packedOdd, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
......
......@@ -54,7 +54,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int k=0;
int xUpdate = 0, rUpdate = 0;
//printf("%d iterations, r2 = %e\n", k, r2);
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x));
stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
......@@ -106,16 +106,16 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) {
QudaPrecision spinorPrec = invert_param->cuda_prec;
invert_param -> cuda_prec = QUDA_SINGLE_PRECISION;
if (updateR) {
//QudaPrecision spinorPrec = invert_param->cuda_prec;
//invert_param -> cuda_prec = QUDA_SINGLE_PRECISION;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
invert_param -> cuda_prec = spinorPrec;
//invert_param -> cuda_prec = spinorPrec;
copyQuda(r, b);
mxpyQuda(t, r);
......@@ -134,14 +134,14 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
maxrx = rNorm;
xUpdate++;
}
}
k++;
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normQuda(x));
}
axpyQuda(1.0f, y, x);
axpyQuda(1.0, y, x);
invert_param->secs += stopwatchReadSeconds();
......@@ -156,19 +156,19 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSlop
invert_param->gflops += gflops;
invert_param->iter += k;
#if 0
//#if 0
// Calculate the true residual
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t.spinor, gauge, x.spinor, invert_param->kappa, tmp.spinor, invert_param->matpc_type);
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t.spinor, gauge, x.spinor, invert_param->kappa, tmp.spinor, invert_param->matpc_type);
copyQuda(r, b);
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
copyQuda(r, source);
mxpyQuda(t, r);
double true_res = normQuda(r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
//#endif
freeParitySpinor(b);
freeParitySpinor(y);
......
......@@ -34,7 +34,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
zeroQuda(x);
int k=0;
//printf("%d iterations, r2 = %e\n", k, r2);
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
......
......@@ -109,9 +109,10 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
createGaugeField(&cudaGaugePrecise, h_gauge, gauge_param->reconstruct_precise, gauge_param->cuda_prec_precise);
gauge_param->gaugeGiB = 2.0*cudaGaugePrecise.packedGaugeBytes/ (1 << 30);
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise) {
if (gauge_param->cuda_prec_sloppy != gauge_param->cuda_prec_precise ||
gauge_param->reconstruct_sloppy != gauge_param->reconstruct_precise) {
createGaugeField(&cudaGaugeSloppy, h_gauge, gauge_param->reconstruct_sloppy, gauge_param->cuda_prec_sloppy);
gauge_param->gaugeGiB = 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
gauge_param->gaugeGiB += 2.0*cudaGaugeSloppy.packedGaugeBytes/ (1 << 30);
} else {
cudaGaugeSloppy = cudaGaugePrecise;
}
......@@ -269,9 +270,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpaySCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
dslashXpayCuda(in, cudaGaugePrecise, b.odd, 0, 0, b.even, kappa);
} else {
dslashXpaySCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
dslashXpayCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
}
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
......@@ -315,9 +316,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpaySCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
dslashXpayCuda(x.odd, cudaGaugePrecise, out, 1, 0, b.odd, kappa);
} else {
dslashXpaySCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
dslashXpayCuda(x.even, cudaGaugePrecise, out, 0, 0, b.even, kappa);
}
retrieveSpinorField(h_x, x, param->cpu_prec, param->cuda_prec, param->dirac_order);
......
......@@ -15,12 +15,12 @@ int main(int argc, char **argv)
QudaGaugeParam Gauge_param;
QudaInvertParam inv_param;
Gauge_param.cpu_prec = QUDA_SINGLE_PRECISION;
Gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec_precise = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_precise = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_precise = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_precise = QUDA_RECONSTRUCT_8;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -30,20 +30,20 @@ int main(int argc, char **argv)
Gauge_param.T = L4;
Gauge_param.anisotropy = 1.0;
inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
inv_param.inv_type = QUDA_CG_INVERTER;
Gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
double mass = -0.958;
double mass = 0.0;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 5e-7;
inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-6;
inv_param.tol = 1e-6;
inv_param.maxiter = 20;
inv_param.reliable_delta = 1e-3;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
......@@ -64,7 +64,7 @@ int main(int argc, char **argv)
int i0 = 0;
int s0 = 0;
int c0 = 0;
construct_spinor_field(spinorIn, 0, i0, s0, c0, inv_param.cpu_prec);
construct_spinor_field(spinorIn, 1, i0, s0, c0, inv_param.cpu_prec);
double time0 = -((double)clock()); // Start the timer
......
......@@ -3,10 +3,10 @@
#include <cuda_runtime.h>
#define L1 4 // "x" dimension
#define L2 4 // "y" dimension
#define L3 4 // "z" dimension
#define L4 4 // "time" dimension
#define L1 24 // "x" dimension
#define L2 24 // "y" dimension
#define L3 24 // "z" dimension
#define L4 32 // "time" dimension
#define L1h (L1/2) // half of the full "x" dimension, useful for even/odd lattice indexing
#define N (L1*L2*L3*L4) // total number of lattice points
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment