Advanced Computing Platform for Theoretical Physics

Commit 72aac654 authored by mikeaclark's avatar mikeaclark
Browse files

Clean up of spinor_quda.cpp

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@405 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 937f5509
......@@ -537,7 +537,7 @@ int dslashCudaSharedBytes() {
// Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
ParitySpinor tmp, MatPCType matpc_type, int dagger) {
checkSpinor(in, out);
checkSpinor(in, tmp);
......@@ -546,64 +546,27 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 0);
dslashXpayDCuda(out, gauge, tmp, 0, 0, in, kappa2);
dslashDCuda(tmp, gauge, in, 1, dagger);
dslashXpayDCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashDCuda(tmp, gauge, in, 0, 0);
dslashXpayDCuda(out, gauge, tmp, 1, 0, in, kappa2);
dslashDCuda(tmp, gauge, in, 0, dagger);
dslashXpayDCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
} else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 0);
dslashXpaySCuda(out, gauge, tmp, 0, 0, in, kappa2);
dslashSCuda(tmp, gauge, in, 1, dagger);
dslashXpaySCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashSCuda(tmp, gauge, in, 0, 0);
dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2);
dslashSCuda(tmp, gauge, in, 0, dagger);
dslashXpaySCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
} else if (in.precision == QUDA_HALF_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 0);
dslashXpayHCuda(out, gauge, tmp, 0, 0, in, kappa2);
dslashHCuda(tmp, gauge, in, 1, dagger);
dslashXpayHCuda(out, gauge, tmp, 0, dagger, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 0);
dslashXpayHCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
}
}
// Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type) {
checkSpinor(in, out);
checkSpinor(in, tmp);
double kappa2 = -kappa*kappa;
if (in.precision == QUDA_DOUBLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashDCuda(tmp, gauge, in, 1, 1);
dslashXpayDCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashDCuda(tmp, gauge, in, 0, 1);
dslashXpayDCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
} else if (in.precision == QUDA_SINGLE_PRECISION) {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashSCuda(tmp, gauge, in, 1, 1);
dslashXpaySCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashSCuda(tmp, gauge, in, 0, 1);
dslashXpaySCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
} else {
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashHCuda(tmp, gauge, in, 1, 1);
dslashXpayHCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashHCuda(tmp, gauge, in, 0, 1);
dslashXpayHCuda(out, gauge, tmp, 1, 1, in, kappa2);
dslashHCuda(tmp, gauge, in, 0, dagger);
dslashXpayHCuda(out, gauge, tmp, 1, dagger, in, kappa2);
}
}
......@@ -611,43 +574,27 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap
void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
double kappa, ParitySpinor tmp, MatPCType matpc_type) {
MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
MatPCCuda(out, gauge, in, kappa, tmp, matpc_type, 0);
MatPCCuda(out, gauge, out, kappa, tmp, matpc_type, 1);
}
// Apply the full operator
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int dagger) {
checkSpinor(in.even, out.even);
if (in.even.precision == QUDA_DOUBLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
dslashXpayDCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
} else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
dslashXpaySCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
dslashXpayHCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
}
}
// Apply the full operator dagger
void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
checkSpinor(in.even, out.even);
if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayDCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (in.even.precision == QUDA_SINGLE_PRECISION) {
dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
} else if (in.even.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
dslashXpayHCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
}
}
/*
// Apply the even-odd preconditioned Dirac operator
......
......@@ -59,13 +59,11 @@ extern "C" {
ParitySpinor x, double a);
// Full Wilson matrix
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int daggerBit);
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type);
double kappa, ParitySpinor tmp, MatPCType matpc_type, int daggerBit);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type);
......
......@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
int test_type = 2;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -25,7 +25,7 @@ void *spinorEven, *spinorOdd;
double kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
int TRANSFER = 1; // include transfer time in the benchmark?
void init() {
......@@ -126,12 +126,12 @@ double dslashCUDA() {
else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
break;
case 1:
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param);
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
break;
case 2:
if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param);
else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa);
if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
}
}
......
......@@ -36,7 +36,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
zeroCuda(x_sloppy);
copyCuda(b, src);
copyCuda(r_sloppy, src_sloppy);
copyCuda(r_sloppy, src);
/*MatPCDagCuda(y, gaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab
zeroCuda(y);
double b2 = normCuda(b);
......@@ -73,7 +77,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
while (r2 > stop && k<invert_param->maxiter) {
if (k==0) {
rho = make_cuDoubleComplex(r2, 0.0);
rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(src_sloppy, r_sloppy); // BiCRstab
copyCuda(p, r_sloppy);
} else {
alpha_omega = cuCdiv(alpha, omega);
......@@ -85,11 +89,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
}
if (dag_type == QUDA_DAG_NO)
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
else
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// rv = (r0,v)
rv = cDotProductCuda(src_sloppy, v);
alpha = cuCdiv(rho, rv);
......@@ -99,10 +101,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
caxpyCuda(alpha, v, r_sloppy); // 4
alpha.x *= -1.0; alpha.y *= -1.0;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
else
MatPCDagCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
......@@ -122,10 +121,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
if (updateR) {
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
if (dag_type == QUDA_DAG_NO)
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
r2 = xmyNormCuda(b, r);
if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
......@@ -168,10 +164,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
#if 0
// Calculate the true residual
if (dag_type == QUDA_DAG_NO)
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
double true_res = xmyNormCuda(src, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
......
......@@ -134,29 +134,14 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
freeParitySpinor(in);
}
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
{
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
freeParitySpinor(in);
}
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
MatPCDagCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger);
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
......@@ -179,29 +164,14 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
freeParitySpinor(in);
}
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 0, in.odd, -inv_param->kappa);
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 0, in.even, -inv_param->kappa);
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
freeSpinorField(out);
freeSpinorField(in);
}
void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 1, in.odd, -inv_param->kappa);
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 1, in.even, -inv_param->kappa);
dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, dagger, in.odd, -inv_param->kappa);
dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, dagger, in.even, -inv_param->kappa);
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -277,7 +247,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
case QUDA_CG_INVERTER:
if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
copyCuda(out, in);
MatPCDagCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type);
MatPCCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type, QUDA_DAG_YES);
}
invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
break;
......
......@@ -70,12 +70,10 @@ extern "C" {
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger);
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);
void endQuda(void);
......
......@@ -20,7 +20,7 @@ int main(int argc, char **argv)
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -38,13 +38,13 @@ int main(int argc, char **argv)
double mass = -0.97;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-7;
inv_param.maxiter = 5000;
inv_param.tol = 1e-12;
inv_param.maxiter = 10000;
inv_param.reliable_delta = 1e-2;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
......
......@@ -126,226 +126,233 @@ inline void unpackFloat4(float *a, float4 *b) {
//a[0] = b->x; a[1] = b->y; a[2] = b->z; a[3] = b->w;
}
void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
double K = 1.0 / 2.0;
double b[24];
for (int i=0; i<Nh; i++) {
template <typename Float>
inline void packSpinorVector(float4* a, Float *b) {
Float K = 1.0 / 2.0;
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
a[0*Nh].x = K*(b[1*6+0*2+0]+b[3*6+0*2+0]);
a[0*Nh].y = K*(b[1*6+0*2+1]+b[3*6+0*2+1]);
a[0*Nh].z = K*(b[1*6+1*2+0]+b[3*6+1*2+0]);
a[0*Nh].w = K*(b[1*6+1*2+1]+b[3*6+1*2+1]);
{ // even sites
int k = 2*i + boundaryCrossings%2;
double *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*2);
}
a[1*Nh].x = K*(b[1*6+2*2+0]+b[3*6+2*2+0]);
a[1*Nh].y = K*(b[1*6+2*2+1]+b[3*6+2*2+1]);
a[1*Nh].z = -K*(b[0*6+0*2+0]+b[2*6+0*2+0]);
a[1*Nh].w = -K*(b[0*6+0*2+1]+b[2*6+0*2+1]);
a[2*Nh].x = -K*(b[0*6+1*2+0]+b[2*6+1*2+0]);
a[2*Nh].y = -K*(b[0*6+1*2+1]+b[2*6+1*2+1]);
a[2*Nh].z = -K*(b[0*6+2*2+0]+b[2*6+2*2+0]);
a[2*Nh].w = -K*(b[0*6+2*2+1]+b[2*6+2*2+1]);
a[3*Nh].x = K*(b[1*6+0*2+0]-b[3*6+0*2+0]);
a[3*Nh].y = K*(b[1*6+0*2+1]-b[3*6+0*2+1]);
a[3*Nh].z = K*(b[1*6+1*2+0]-b[3*6+1*2+0]);
a[3*Nh].w = K*(b[1*6+1*2+1]-b[3*6+1*2+1]);
a[4*Nh].x = K*(b[1*6+2*2+0]-b[3*6+2*2+0]);
a[4*Nh].y = K*(b[1*6+2*2+1]-b[3*6+2*2+1]);
a[4*Nh].z = K*(b[2*6+0*2+0]-b[0*6+0*2+0]);
a[4*Nh].w = K*(b[2*6+0*2+1]-b[0*6+0*2+1]);
a[5*Nh].x = K*(b[2*6+1*2+0]-b[0*6+1*2+0]);
a[5*Nh].y = K*(b[2*6+1*2+1]-b[0*6+1*2+1]);
a[5*Nh].z = K*(b[2*6+2*2+0]-b[0*6+2*2+0]);
a[5*Nh].w = K*(b[2*6+2*2+1]-b[0*6+2*2+1]);
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
double *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j=0; j<12; j++) packDouble2(odd+j*Nh+i, b+j*2);
}
}
}
template <typename Float>
void packFullSpinorSF(float4 *even, float4 *odd, Float *spinor) {
inline void packQDPSpinorVector(float4* a, Float *b) {
Float K = 1.0 / 2.0;
float b[24];
for (int i=0; i<Nh; i++) {
a[0*Nh].x = K*(b[(0*4+1)*2+0]+b[(0*4+3)*2+0]);
a[0*Nh].y = K*(b[(0*4+1)*2+1]+b[(0*4+3)*2+1]);
a[0*Nh].z = K*(b[(1*4+1)*2+0]+b[(1*4+3)*2+0]);
a[0*Nh].w = K*(b[(1*4+1)*2+1]+b[(1*4+3)*2+1]);
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
a[1*Nh].x = K*(b[(2*4+1)*2+0]+b[(2*4+3)*2+0]);
a[1*Nh].y = K*(b[(2*4+1)*2+1]+b[(2*4+3)*2+1]);
a[1*Nh].z = -K*(b[(0*4+0)*2+0]+b[(0*4+2)*2+0]);
a[1*Nh].w = -K*(b[(0*4+0)*2+1]+b[(0*4+2)*2+1]);
{ // even sites
int k = 2*i + boundaryCrossings%2;
Float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j=0; j<6; j++) packFloat4(even+j*Nh+i, b+j*4);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
Float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j=0; j<6; j++) packFloat4(odd+j*Nh+i, b+j*4);
}
}
a[2*Nh].x = -K*(b[(1*4+0)*2+0]+b[(1*4+2)*2+0]);
a[2*Nh].y = -K*(b[(1*4+0)*2+1]+b[(1*4+2)*2+1]);
a[2*Nh].z = -K*(b[(2*4+0)*2+0]+b[(2*4+2)*2+0]);
a[2*Nh].w = -K*(b[(2*4+0)*2+1]+b[(2*4+2)*2+1]);
a[3*Nh].x = K*(b[(0*4+1)*2+0]+b[(0*4+3)*2+0]);
a[3*Nh].y = K*(b[(0*4+1)*2+1]+b[(0*4+3)*2+1]);
a[3*Nh].z = K*(b[(1*4+1)*2+0]+b[(1*4+3)*2+0]);
a[3*Nh].w = K*(b[(1*4+1)*2+1]+b[(1*4+3)*2+1]);
a[4*Nh].x = K*(b[(2*4+1)*2+0]+b[(2*4+3)*2+0]);
a[4*Nh].y = K*(b[(2*4+1)*2+1]+b[(2*4+3)*2+1]);
a[4*Nh].z = K*(b[(0*4+2)*2+0]+b[(0*4+0)*2+0]);
a[4*Nh].w = K*(b[(0*4+2)*2+1]+b[(0*4+0)*2+1]);
a[5*Nh].x = K*(b[(1*4+2)*2+0]+b[(1*4+0)*2+0]);
a[5*Nh].y = K*(b[(1*4+2)*2+1]+b[(1*4+0)*2+1]);
a[5*Nh].z = K*(b[(2*4+2)*2+0]+b[(2*4+0)*2+0]);
a[5*Nh].w = K*(b[(2*4+2)*2+1]+b[(2*4+0)*2+1]);
}
// Standard spinor packing, colour inside spin
void packParitySpinorDD(double2 *res, double *spinor) {
double K = 1.0 / (2.0);
double b[24];
template <typename Float>
inline void packSpinorVector(double2* a, Float *b) {
Float K = 1.0 / 2.0;
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int c=0; c<3; c++) {
a[c*Nh].x = K*(b[1*6+c*2+0]+b[3*6+c*2+0]);
a[c*Nh].y = K*(b[1*6+c*2+1]+b[3*6+c*2+1]);
a[(3+c)*Nh].x = -K*(b[0*6+c*2+0]+b[2*6+c*2+0]);
a[(3+c)*Nh].y = -K*(b[0*6+c*2+1]+b[2*6+c*2+1]);
a[(6+c)*Nh].x = K*(b[1*6+c*2+0]-b[3*6+c*2+0]);
a[(6+c)*Nh].y = K*(b[1*6+c*2+1]-b[3*6+c*2+1]);
for (int j=0; j<12; j++) packDouble2(res+j*Nh+i, b+j*2);
a[(9+c)*Nh].x = K*(b[2*6+c*2+0]-b[0*6+c*2+0]);
a[(9+c)*Nh].y = K*(b[2*6+c*2+1]-b[0*6+c*2+1]);
}
}
void packParitySpinorSD(float4 *res, double *spinor) {
double K = 1.0 / (2.0);
float b[24];
template <typename Float>
inline void packQDPSpinorVector(double2* a, Float *b) {
Float K = 1.0 / 2.0;
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int c=0; c<3; c++) {
a[c*Nh].x = K*(b[(c*4+1)*2+0]+b[(c*4+3)*2+0]);
a[c*Nh].y = K*(b[(c*4+1)*2+1]+b[(c*4+3)*2+1]);
a[(3+c)*Nh].x = -K*(b[(c*4+0)*2+0]+b[(c*4+2)*2+0]);
a[(3+c)*Nh].y = -K*(b[(c*4+0)*2+1]+b[(c*4+2)*2+1]);
for (int j=0; j<6; j++) packFloat4(res+j*Nh+i, b+j*4);
a[(6+c)*Nh].x = K*(b[(c*4+1)*2+0]-b[(c*4+3)*2+0]);
a[(6+c)*Nh].y = K*(b[(c*4+1)*2+1]-b[(c*4+3)*2+1]);
a[(9+c)*Nh].x = K*(b[(c*4+2)*2+0]-b[(c*4+0)*2+0]);
a[(9+c)*Nh].y = K*(b[(c*4+2)*2+1]-b[(c*4+0)*2+1]);
}
}
// single precision version of the above
void packParitySpinorSS(float4 *res, float *spinor) {
float K = 1.0 / (2.0);
float b[24];
}
for (int i = 0; i < Nh; i++) {
float *a = spinor+i*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[cr] = K*(a[6+cr]+a[18+cr]);
b[6+cr] = -K*(a[cr]+a[12+cr]);
b[12+cr] = K*(a[6+cr]-a[18+cr]);
b[18+cr] = K*(a[12+cr]-a[cr]);
}
}
template <typename Float>
inline void unpackSpinorVector(Float *a, float4 *b) {
Float K = 1.0;
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
}
a[0*6+0*2+0] = -K*(b[Nh].z+b[4*Nh].z);
a[0*6+0*2+1] = -K*(b[Nh].w+b[4*Nh].w);
a[0*6+1*2+0] = -K*(b[2*Nh].x+b[5*Nh].x);
a[0*6+1*2+1] = -K*(b[2*Nh].y+b[5*Nh].y);
a[0*6+2*2+0] = -K*(b[2*Nh].z+b[5*Nh].z);
a[0*6+2*2+1] = -K*(b[2*Nh].w+b[5*Nh].w);
a[1*6+0*2+0] = K*(b[0].x+b[3*Nh].x);
a[1*6+0*2+1] = K*(b[0].y+b[3*Nh].y);
a[1*6+1*2+0] = K*(b[0].z+b[3*Nh].z);
a[1*6+1*2+1] = K*(b[0].w+b[3*Nh].w);
a[1*6+2*2+0] = K*(b[Nh].x+b[4*Nh].x);
a[1*6+2*2+1] = K*(b[Nh].y+b[4*Nh].y);
a[2*6+0*2+0] = -K*(b[Nh].z-b[4*Nh].z);
a[2*6+0*2+1] = -K*(b[Nh].w-b[4*Nh].w);
a[2*6+1*2+0] = -K*(b[2*Nh].x-b[5*Nh].x);
a[2*6+1*2+1] = -K*(b[2*Nh].y-b[5*Nh].y);
a[2*6+2*2+0] = -K*(b[2*Nh].z-b[5*Nh].z);
a[2*6+2*2+1] = -K*(b[2*Nh].w-b[5*Nh].w);
a[3*6+0*2+0] = -K*(b[3*Nh].x-b[0].x);
a[3*6+0*2+1] = -K*(b[3*Nh].y-b[0].y);
a[3*6+1*2+0] = -K*(b[3*Nh].z-b[0].z);
a[3*6+1*2+1] = -K*(b[3*Nh].w-b[0].w);
a[3*6+2*2+0] = -K*(b[4*Nh].x-b[Nh].x);
a[3*6+2*2+1] = -K*(b[4*Nh].y-b[Nh].y);
}
// QDP spinor packing, spin inside colour
void packQDPParitySpinorDD(double2 *res, double *spinor) {
double K = 1.0 / 2.0;
template <typename Float>
inline void unpackQDPSpinorVector(Float *a, float4 *b) {
Float K = 1.0;
a[(0*4+0)*2+0] = -K*(b[Nh].z+b[4*Nh].z);
a[(0*4+0)*2+1] = -K*(b[Nh].w+b[4*Nh].w);
a[(1*4+0)*2+0] = -K*(b[2*Nh].x+b[5*Nh].x);
a[(1*4+0)*2+1] = -K*(b[2*Nh].y+b[5*Nh].y);
a[(2*4+0)*2+0] = -K*(b[2*Nh].z+b[5*Nh].z);
a[(2*4+0)*2+1] = -K*(b[2*Nh].w+b[5*Nh].w);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
// reorder and change basis
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
}
}
a[(0*4+1)*2+0] = K*(b[0].x+b[3*Nh].x);
a[(0*4+1)*2+1] = K*(b[0].y+b[3*Nh].y);
a[(1*4+1)*2+0] = K*(b[0].z+b[3*Nh].z);
a[(1*4+1)*2+1] = K*(b[0].w+b[3*Nh].w);
a[(2*4+1)*2+0] = K*(b[Nh].x+b[4*Nh].x);
a[(2*4+1)*2+1] = K*(b[Nh].y+b[4*Nh].y);
a[(0*4+2)*2+0] = -K*(b[Nh].z-b[4*Nh].z);
a[(0*4+2)*2+1] = -K*(b[Nh].w-b[4*Nh].w);
a[(1*4+2)*2+0] = -K*(b[2*Nh].x-b[5*Nh].x);
a[(1*4+2)*2+1] = -K*(b[2*Nh].y-b[5*Nh].y);
a[(2*4+2)*2+0] = -K*(b[2*Nh].z-b[5*Nh].z);
a[(2*4+2)*2+1] = -K*(b[2*Nh].w-b[5*Nh].w);
a[(0*4+3)*2+0] = -K*(b[3*Nh].x-b[0].x);
a[(0*4+3)*2+1] = -K*(b[3*Nh].y-b[0].y);
a[(1*4+3)*2+0] = -K*(b[3*Nh].z-b[0].z);
a[(1*4+3)*2+1] = -K*(b[3*Nh].w-b[0].w);
a[(2*4+3)*2+0] = -K*(b[4*Nh].x-b[Nh].x);
a[(2*4+3)*2+1] = -K*(b[4*Nh].y-b[Nh].y);
}
template <typename Float>
inline void unpackSpinorVector(Float *a, double2 *b) {
Float K = 1.0;
for (int j = 0; j < 6; j++) packDouble2(res+j*Nh+i, b+j*2);
for (int c=0; c<3; c++) {
a[0*6+c*2+0] = -K*(b[(3+c)*Nh].x+b[(9+c)*Nh].x);
a[0*6+c*2+1] = -K*(b[(3+c)*Nh].y+b[(9+c)*Nh].y);
a[1*6+c*2+0] = K*(b[c*Nh].x+b[(6+c)*Nh].x);
a[1*6+c*2+1] = K*(b[c*Nh].y+b[(6+c)*Nh].y);
a[2*6+c*2+0] = -K*(b[(3+c)*Nh].x-b[(9+c)*Nh].x);
a[2*6+c*2+1] = -K*(b[(3+c)*Nh].y-b[(9+c)*Nh].y);
a[3*6+c*2+0] = -K*(b[(6+c)*Nh].x-b[c*Nh].x);
a[3*6+c*2+1] = -K*(b[(6+c)*Nh].y-b[c*Nh].y);
}
}
// QDP spinor packing, spin inside colour
void packQDPParitySpinorSD(float4 *res, double *spinor) {
double K = 1.0 / 2.0;
float b[24];
for (int i = 0; i < Nh; i++) {
double *a = spinor+i*24;
// reorder and change basis
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
}
}
template <typename Float>
inline void unpackQDPSpinorVector(Float *a, double2 *b) {
Float K = 1.0;
for (int c=0; c<3; c++) {
a[(c*4+0)*2+0] = -K*(b[(3+c)*Nh].x+b[(9+c)*Nh].x);
a[(c*4+0)*2+1] = -K*(b[(3+c)*Nh].y+b[(9+c)*Nh].y);
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
a[(c*4+1)*2+0] = K*(b[c*Nh].x+b[(6+c)*Nh].x);
a[(c*4+1)*2+1] = K*(b[c*Nh].y+b[(6+c)*Nh].y);
a[(c*4+2)*2+0] = -K*(b[(3+c)*Nh].x-b[(9+c)*Nh].x);
a[(c*4+2)*2+1] = -K*(b[(3+c)*Nh].y-b[(9+c)*Nh].y);
a[(c*4+3)*2+0] = -K*(b[(6+c)*Nh].x-b[c*Nh].x);
a[(c*4+3)*2+1] = -K*(b[(6+c)*Nh].y-b[c*Nh].y);
}
}
// Single precision version of the above
void packQDPParitySpinorSS(float4 *res, float *spinor) {
float K = 1.0 / 2.0;
float b[24];
// Standard spinor packing, colour inside spin
template <typename Float, typename FloatN>
void packParitySpinor(FloatN *res, Float *spinor) {
for (int i = 0; i < Nh; i++) {
float *a = spinor+i*24;
// reorder and change basis
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
}
}
for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
packSpinorVector(res+i, spinor+24*i);
}
}
void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
double K = 1.0;
double b[24];
template <typename Float, typename FloatN>
void packFullSpinor(FloatN *even, FloatN *odd, Float *spinor) {
for (int i=0; i<Nh; i++) {
......@@ -353,47 +360,19 @@ void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
{ // even sites
int k = 2*i + boundaryCrossings%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, even+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
packSpinorVector(even+i, spinor+24*k);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
double *a = res + k*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, odd+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
packSpinorVector(odd+i, spinor+24*k);
}
}
}
template <typename Float>
void unpackFullSpinorFS(Float *res, float4 *even, float4 *odd) {
Float K = 1.0;
float b[24];
template <typename Float, typename FloatN>
void unpackFullSpinor(Float *res, FloatN *even, FloatN *odd) {
for (int i=0; i<Nh; i++) {
......@@ -401,179 +380,43 @@ void unpackFullSpinorFS(Float *res, float4 *even, float4 *odd) {
{ // even sites
int k = 2*i + boundaryCrossings%2;
Float *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, even+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
unpackSpinorVector(res+24*k, even+i);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
Float *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, odd+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
unpackSpinorVector(res+24*k, odd+i);
}
}
}
void unpackParitySpinorDD(double *res, double2 *spinorPacked) {
double K = 1.0;///sqrt(2.0);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
void unpackParitySpinorDS(double *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
template <typename Float, typename FloatN>
void unpackParitySpinor(Float *res, FloatN *spinorPacked) {
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
unpackSpinorVector(res+i*24, spinorPacked+i);
}
}
void unpackParitySpinorSS(float *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
for (int i = 0; i < Nh; i++) {
float *a = res+i*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
void unpackQDPParitySpinorDD(double *res, double2 *spinorPacked) {
double K = 1.0;///sqrt(2.0);
double b[24];
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
void unpackQDPParitySpinorDS(double *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
// QDP spinor packing, spin inside colour
template <typename Float, typename FloatN>
void packQDPParitySpinor(FloatN *res, Float *spinor) {
for (int i = 0; i < Nh; i++) {
double *a = res+i*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
packQDPSpinorVector(res+i, spinor+i*24);
}
}
void unpackQDPParitySpinorSS(float *res, float4 *spinorPacked) {
float K = 1.0;///sqrt(2.0);
float b[24];
// QDP spinor packing, spin inside colour
template <typename Float, typename FloatN>
void unpackQDPParitySpinor(Float *res, FloatN *spinor) {
for (int i = 0; i < Nh; i++) {
float *a = res+i*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
unpackQDPSpinorVector(res+i*24, spinor+i);
}
}
void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
DiracFieldOrder dirac_order) {
......@@ -595,17 +438,17 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (ret.precision == QUDA_DOUBLE_PRECISION) {
packParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
packParitySpinor((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinor((float4*)packedSpinor1, (double*)spinor);
else packParitySpinor((float4*)packedSpinor1, (float*)spinor);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (ret.precision == QUDA_DOUBLE_PRECISION) {
packQDPParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
packQDPParitySpinor((double2*)packedSpinor1, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
else packQDPParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinor((float4*)packedSpinor1, (double*)spinor);
else packQDPParitySpinor((float4*)packedSpinor1, (float*)spinor);
}
}
cudaMemcpy(ret.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
......@@ -634,11 +477,12 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
#endif
if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
packFullSpinorDD((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
packFullSpinor((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION)
packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
packFullSpinor((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else
packFullSpinor((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
}
cudaMemcpy(ret.even.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
......@@ -692,17 +536,17 @@ void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Di
cudaMemcpy(packedSpinor1, spinor.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
if (spinor.precision == QUDA_DOUBLE_PRECISION) {
unpackParitySpinorDD((double*)res, (double2*)packedSpinor1);
unpackParitySpinor((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackParitySpinorSS((float*)res, (float4*)packedSpinor1);
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinor((double*)res, (float4*)packedSpinor1);
else unpackParitySpinor((float*)res, (float4*)packedSpinor1);
}
} else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
if (spinor.precision == QUDA_DOUBLE_PRECISION) {
unpackQDPParitySpinorDD((double*)res, (double2*)packedSpinor1);
unpackQDPParitySpinor((double*)res, (double2*)packedSpinor1);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinorDS((double*)res, (float4*)packedSpinor1);
else unpackQDPParitySpinorSS((float*)res, (float4*)packedSpinor1);
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinor((double*)res, (float4*)packedSpinor1);
else unpackQDPParitySpinor((float*)res, (float4*)packedSpinor1);
}
}
} else {
......@@ -726,11 +570,11 @@ void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
if (spinor.even.precision == QUDA_DOUBLE_PRECISION) {
unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
unpackFullSpinor((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION)
unpackFullSpinorFS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorFS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
unpackFullSpinor((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinor((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
}
#ifndef __DEVICE_EMULATION__
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment