Clean up of spinor_quda.cpp

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@405 be54200a-260c-0410-bdd7-ce6af2a381ab

Clean up of spinor_quda.cpp
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@405 be54200a-260c-0410-bdd7-ce6af2a381ab
72aac654 · mikeaclark · 937f5509 · 72aac654 · 72aac654 · 72aac654
Commit 72aac654 authored Jun 29, 2009 by mikeaclark
--- a/dslash_quda.cu
+++ b/dslash_quda.cu
@@ -537,7 +537,7 @@ int dslashCudaSharedBytes() {

 // Apply the even-odd preconditioned Dirac operator
 void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, 
-	       ParitySpinor tmp, MatPCType matpc_type) {
+	       ParitySpinor tmp, MatPCType matpc_type, int dagger) {

  checkSpinor(in, out);
  checkSpinor(in, tmp);
@@ -546,64 +546,27 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,

  if (in.precision == QUDA_DOUBLE_PRECISION) {
    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashDCuda(tmp, gauge, in, 1, 0);
-      dslashXpayDCuda(out, gauge, tmp, 0, 0, in, kappa2); 
+      dslashDCuda(tmp, gauge, in, 1, dagger);
+      dslashXpayDCuda(out, gauge, tmp, 0, dagger, in, kappa2); 
    } else {
-      dslashDCuda(tmp, gauge, in, 0, 0);
-      dslashXpayDCuda(out, gauge, tmp, 1, 0, in, kappa2); 
+      dslashDCuda(tmp, gauge, in, 0, dagger);
+      dslashXpayDCuda(out, gauge, tmp, 1, dagger, in, kappa2); 
    }
  } else if (in.precision == QUDA_SINGLE_PRECISION) {
    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashSCuda(tmp, gauge, in, 1, 0);
-      dslashXpaySCuda(out, gauge, tmp, 0, 0, in, kappa2); 
+      dslashSCuda(tmp, gauge, in, 1, dagger);
+      dslashXpaySCuda(out, gauge, tmp, 0, dagger, in, kappa2); 
    } else {
-      dslashSCuda(tmp, gauge, in, 0, 0);
-      dslashXpaySCuda(out, gauge, tmp, 1, 0, in, kappa2); 
+      dslashSCuda(tmp, gauge, in, 0, dagger);
+      dslashXpaySCuda(out, gauge, tmp, 1, dagger, in, kappa2); 
    }
  } else if (in.precision == QUDA_HALF_PRECISION) {
    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashHCuda(tmp, gauge, in, 1, 0);
-      dslashXpayHCuda(out, gauge, tmp, 0, 0, in, kappa2); 
+      dslashHCuda(tmp, gauge, in, 1, dagger);
+      dslashXpayHCuda(out, gauge, tmp, 0, dagger, in, kappa2); 
    } else {
-      dslashHCuda(tmp, gauge, in, 0, 0);
-      dslashXpayHCuda(out, gauge, tmp, 1, 0, in, kappa2); 
-    }
-  }
-
-}
-
-// Apply the even-odd preconditioned Dirac operator
-void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, 
-		  ParitySpinor tmp, MatPCType matpc_type) {
-
-  checkSpinor(in, out);
-  checkSpinor(in, tmp);
-
-  double kappa2 = -kappa*kappa;
-
-  if (in.precision == QUDA_DOUBLE_PRECISION) {
-    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashDCuda(tmp, gauge, in, 1, 1);
-      dslashXpayDCuda(out, gauge, tmp, 0, 1, in, kappa2);
-    } else {
-      dslashDCuda(tmp, gauge, in, 0, 1);
-      dslashXpayDCuda(out, gauge, tmp, 1, 1, in, kappa2);
-    }
-  } else if (in.precision == QUDA_SINGLE_PRECISION) {
-    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashSCuda(tmp, gauge, in, 1, 1);
-      dslashXpaySCuda(out, gauge, tmp, 0, 1, in, kappa2);
-    } else {
-      dslashSCuda(tmp, gauge, in, 0, 1);
-      dslashXpaySCuda(out, gauge, tmp, 1, 1, in, kappa2);
-    }
-  } else {
-    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
-      dslashHCuda(tmp, gauge, in, 1, 1);
-      dslashXpayHCuda(out, gauge, tmp, 0, 1, in, kappa2);
-    } else {
-      dslashHCuda(tmp, gauge, in, 0, 1);
-      dslashXpayHCuda(out, gauge, tmp, 1, 1, in, kappa2);
+      dslashHCuda(tmp, gauge, in, 0, dagger);
+      dslashXpayHCuda(out, gauge, tmp, 1, dagger, in, kappa2); 
    }
  }

@@ -611,43 +574,27 @@ void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kap

 void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, 
 		       double kappa, ParitySpinor tmp, MatPCType matpc_type) {
-  MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
-  MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
+  MatPCCuda(out, gauge, in, kappa, tmp, matpc_type, 0);
+  MatPCCuda(out, gauge, out, kappa, tmp, matpc_type, 1);
 }

 // Apply the full operator
-void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
+void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int dagger) {
  checkSpinor(in.even, out.even);

  if (in.even.precision == QUDA_DOUBLE_PRECISION) {
-    dslashXpayDCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
-    dslashXpayDCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
+    dslashXpayDCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
+    dslashXpayDCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
  } else if (in.even.precision == QUDA_SINGLE_PRECISION) {
-    dslashXpaySCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
-    dslashXpaySCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
+    dslashXpaySCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
+    dslashXpaySCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
  } else if (in.even.precision == QUDA_HALF_PRECISION) {
-    dslashXpayHCuda(out.odd, gauge, in.even, 1, 0, in.odd, -kappa);
-    dslashXpayHCuda(out.even, gauge, in.odd, 0, 0, in.even, -kappa);
+    dslashXpayHCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa);
+    dslashXpayHCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa);
  }

 }

-// Apply the full operator dagger
-void MatDaggerCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa) {
-  checkSpinor(in.even, out.even);
-
-  if (in.even.precision == QUDA_SINGLE_PRECISION) {
-    dslashXpayDCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
-    dslashXpayDCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
-  } else if (in.even.precision == QUDA_SINGLE_PRECISION) {
-    dslashXpaySCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
-    dslashXpaySCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
-  } else if (in.even.precision == QUDA_HALF_PRECISION) {
-    dslashXpayHCuda(out.odd, gauge, in.even, 1, 1, in.odd, -kappa);
-    dslashXpayHCuda(out.even, gauge, in.odd, 0, 1, in.even, -kappa);
-  }
-
-}

 /*
 // Apply the even-odd preconditioned Dirac operator

--- a/dslash_quda.h
+++ b/dslash_quda.h
@@ -59,13 +59,11 @@ extern "C" {
 		      ParitySpinor x, double a);

  // Full Wilson matrix
-  void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
-  void MatDagCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa);
+  void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int daggerBit);

  void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
-		 double kappa, ParitySpinor tmp, MatPCType matpc_type);
-  void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
-		    double kappa, ParitySpinor tmp, MatPCType matpc_type);
+		 double kappa, ParitySpinor tmp, MatPCType matpc_type, int daggerBit);
+
  void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
 			 double kappa, ParitySpinor tmp, MatPCType matpc_type);
  

--- a/dslash_test.c
+++ b/dslash_test.c
@@ -8,7 +8,7 @@
 #include <gauge_quda.h>

 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
-int test_type = 1;
+int test_type = 2;

 QudaGaugeParam gaugeParam;
 QudaInvertParam inv_param;
@@ -25,7 +25,7 @@ void *spinorEven, *spinorOdd;
 double kappa = 1.0;
 int ODD_BIT = 0;
 int DAGGER_BIT = 0;
-int TRANSFER = 0; // include transfer time in the benchmark?
+int TRANSFER = 1; // include transfer time in the benchmark?

 void init() {

@@ -126,12 +126,12 @@ double dslashCUDA() {
      else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
      break;
    case 1:
-      if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param);
-      else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
+      if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param, DAGGER_BIT);
+      else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN, DAGGER_BIT);
      break;
    case 2:
-      if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param);
-      else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa);
+      if (TRANSFER) MatQuda(spinorGPU, spinor, &inv_param, DAGGER_BIT);
+      else MatCuda(cudaSpinorOut, gauge, cudaSpinor, kappa, DAGGER_BIT);
    }
  }
    

--- a/inv_bicgstab_quda.cpp
+++ b/inv_bicgstab_quda.cpp
@@ -36,7 +36,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,

  zeroCuda(x_sloppy);
  copyCuda(b, src);
-  copyCuda(r_sloppy, src_sloppy);
+  copyCuda(r_sloppy, src);
+
+  /*MatPCDagCuda(y, gaugePrecise, src, invert_param->kappa, tmp, invert_param->matpc_type);
+    copyCuda(src_sloppy, y);*/ // uncomment for BiCRstab
+
  zeroCuda(y);

  double b2 = normCuda(b);
@@ -73,7 +77,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
  while (r2 > stop && k<invert_param->maxiter) {

    if (k==0) {
-      rho = make_cuDoubleComplex(r2, 0.0);
+      rho = make_cuDoubleComplex(r2, 0.0); // cDotProductCuda(src_sloppy, r_sloppy); // BiCRstab
      copyCuda(p, r_sloppy);
    } else {
      alpha_omega = cuCdiv(alpha, omega);
@@ -85,11 +89,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
      cxpaypbzCuda(r_sloppy, beta_omega, v, beta, p); // 8
    }

-    if (dag_type == QUDA_DAG_NO) 
-      MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
-    else 
-      MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
+    MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);
    
+    // rv = (r0,v)
    rv = cDotProductCuda(src_sloppy, v);

    alpha = cuCdiv(rho, rv);
@@ -99,10 +101,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
    caxpyCuda(alpha, v, r_sloppy); // 4
    alpha.x *= -1.0; alpha.y *= -1.0;

-    if (dag_type == QUDA_DAG_NO) 
-      MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
-    else  
-      MatPCDagCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type);
+    MatPCCuda(t, gaugeSloppy, r_sloppy, invert_param->kappa, tmp_sloppy, invert_param->matpc_type, dag_type);

    // omega = (t, r) / (t, t)
    omega_t2 = cDotProductNormACuda(t, r_sloppy); // 6
@@ -122,10 +121,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,
    if (updateR) {
      if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);

-      if (dag_type == QUDA_DAG_NO) 
-	MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
-      else 
-	MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
+      MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);

      r2 = xmyNormCuda(b, r);
      if (x.precision != r_sloppy.precision) copyCuda(r_sloppy, r);
@@ -168,10 +164,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugeSloppy,

 #if 0
  // Calculate the true residual
-  if (dag_type == QUDA_DAG_NO) 
-    MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
-  else 
-    MatPCDagCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
+  MatPCCuda(r, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type, dag_type);
  double true_res = xmyNormCuda(src, r);
  
  printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));

--- a/invert_quda.cpp
+++ b/invert_quda.cpp
@@ -134,29 +134,14 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
  freeParitySpinor(in);
 }

-void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
+void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
 {
  ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
  ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
  ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
  
  loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
-  MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
-  retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
-
-  freeParitySpinor(tmp);
-  freeParitySpinor(out);
-  freeParitySpinor(in);
-}
-
-void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
-{
-  ParitySpinor in = allocateParitySpinor(Nh, inv_param->cuda_prec);
-  ParitySpinor out = allocateParitySpinor(Nh, inv_param->cuda_prec);
-  ParitySpinor tmp = allocateParitySpinor(Nh, inv_param->cuda_prec);
-  
-  loadParitySpinor(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
-  MatPCDagCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type);
+  MatPCCuda(out, cudaGaugePrecise, in, inv_param->kappa, tmp, inv_param->matpc_type, dagger);
  retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);

  freeParitySpinor(tmp);
@@ -179,29 +164,14 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
  freeParitySpinor(in);
 }

-void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
-  FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
-  FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);
-
-  loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);
-
-  dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 0, in.odd, -inv_param->kappa);
-  dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 0, in.even, -inv_param->kappa);
-
-  retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
-
-  freeSpinorField(out);
-  freeSpinorField(in);
-}
-
-void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) {
+void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger) {
  FullSpinor in = allocateSpinorField(N, inv_param->cuda_prec);
  FullSpinor out = allocateSpinorField(N, inv_param->cuda_prec);

  loadSpinorField(in, h_in, inv_param->cpu_prec, inv_param->dirac_order);

-  dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, 1, in.odd, -inv_param->kappa);
-  dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, 1, in.even, -inv_param->kappa);
+  dslashXpayCuda(out.odd, cudaGaugePrecise, in.even, 1, dagger, in.odd, -inv_param->kappa);
+  dslashXpayCuda(out.even, cudaGaugePrecise, in.odd, 0, dagger, in.even, -inv_param->kappa);

  retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);

@@ -277,7 +247,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
  case QUDA_CG_INVERTER:
    if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
      copyCuda(out, in);
-      MatPCDagCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type);
+      MatPCCuda(in, cudaGaugePrecise, out, kappa, tmp, param->matpc_type, QUDA_DAG_YES);
    }
    invertCgCuda(out, in, cudaGaugeSloppy, tmp, param);
    break;

--- a/invert_quda.h
+++ b/invert_quda.h
@@ -70,12 +70,10 @@ extern "C" {
  void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);

  void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger);
-  void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
-  void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
+  void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);
  void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);

-  void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
-  void MatDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
+  void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger);

  void endQuda(void);


--- a/invert_test.c
+++ b/invert_test.c
@@ -20,7 +20,7 @@ int main(int argc, char **argv)
  Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
  Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;

-  Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
+  Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
  Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;

  Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
@@ -38,13 +38,13 @@ int main(int argc, char **argv)
  
  double mass = -0.97;
  inv_param.kappa = 1.0 / (2.0*(4 + mass));
-  inv_param.tol = 1e-7;
-  inv_param.maxiter = 5000;
+  inv_param.tol = 1e-12;
+  inv_param.maxiter = 10000;
  inv_param.reliable_delta = 1e-2;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
  inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
-  inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
+  inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
  inv_param.solution_type = QUDA_MAT_SOLUTION;
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;

--- a/spinor_quda.cpp
+++ b/spinor_quda.cpp
@@ -126,226 +126,233 @@ inline void unpackFloat4(float *a, float4 *b) {
  //a[0] = b->x; a[1] = b->y; a[2] = b->z; a[3] = b->w;
 }

-void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
-  double K = 1.0 / 2.0;
-  double b[24];
-
-  for (int i=0; i<Nh; i++) {
+template <typename Float>
+inline void packSpinorVector(float4* a, Float *b) {
+  Float K = 1.0 / 2.0;

-    int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
+  a[0*Nh].x = K*(b[1*6+0*2+0]+b[3*6+0*2+0]);
+  a[0*Nh].y = K*(b[1*6+0*2+1]+b[3*6+0*2+1]);
+  a[0*Nh].z = K*(b[1*6+1*2+0]+b[3*6+1*2+0]);
+  a[0*Nh].w = K*(b[1*6+1*2+1]+b[3*6+1*2+1]);

-    { // even sites
-      int k = 2*i + boundaryCrossings%2; 
-      double *a = spinor + k*24;
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	  b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	  b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	  b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-	}
-      }
-      for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*2);
-    }
+  a[1*Nh].x = K*(b[1*6+2*2+0]+b[3*6+2*2+0]);
+  a[1*Nh].y = K*(b[1*6+2*2+1]+b[3*6+2*2+1]);
+  a[1*Nh].z = -K*(b[0*6+0*2+0]+b[2*6+0*2+0]);
+  a[1*Nh].w = -K*(b[0*6+0*2+1]+b[2*6+0*2+1]);
+  
+  a[2*Nh].x = -K*(b[0*6+1*2+0]+b[2*6+1*2+0]);
+  a[2*Nh].y = -K*(b[0*6+1*2+1]+b[2*6+1*2+1]);
+  a[2*Nh].z = -K*(b[0*6+2*2+0]+b[2*6+2*2+0]);
+  a[2*Nh].w = -K*(b[0*6+2*2+1]+b[2*6+2*2+1]);
+
+  a[3*Nh].x = K*(b[1*6+0*2+0]-b[3*6+0*2+0]);
+  a[3*Nh].y = K*(b[1*6+0*2+1]-b[3*6+0*2+1]);
+  a[3*Nh].z = K*(b[1*6+1*2+0]-b[3*6+1*2+0]);
+  a[3*Nh].w = K*(b[1*6+1*2+1]-b[3*6+1*2+1]);
+
+  a[4*Nh].x = K*(b[1*6+2*2+0]-b[3*6+2*2+0]);
+  a[4*Nh].y = K*(b[1*6+2*2+1]-b[3*6+2*2+1]);
+  a[4*Nh].z = K*(b[2*6+0*2+0]-b[0*6+0*2+0]);
+  a[4*Nh].w = K*(b[2*6+0*2+1]-b[0*6+0*2+1]);
+
+  a[5*Nh].x = K*(b[2*6+1*2+0]-b[0*6+1*2+0]);
+  a[5*Nh].y = K*(b[2*6+1*2+1]-b[0*6+1*2+1]);
+  a[5*Nh].z = K*(b[2*6+2*2+0]-b[0*6+2*2+0]);
+  a[5*Nh].w = K*(b[2*6+2*2+1]-b[0*6+2*2+1]);
    
-    { // odd sites
-      int k = 2*i + (boundaryCrossings+1)%2;
-      double *a = spinor + k*24;
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	  b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	  b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	  b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-	}
-      }
-      for (int j=0; j<12; j++) packDouble2(odd+j*Nh+i, b+j*2);
-    }
-  }
-
 }

 template <typename Float>
-void packFullSpinorSF(float4 *even, float4 *odd, Float *spinor) {
+inline void packQDPSpinorVector(float4* a, Float *b) {
  Float K = 1.0 / 2.0;
-  float b[24];

-  for (int i=0; i<Nh; i++) {
+  a[0*Nh].x = K*(b[(0*4+1)*2+0]+b[(0*4+3)*2+0]);
+  a[0*Nh].y = K*(b[(0*4+1)*2+1]+b[(0*4+3)*2+1]);
+  a[0*Nh].z = K*(b[(1*4+1)*2+0]+b[(1*4+3)*2+0]);
+  a[0*Nh].w = K*(b[(1*4+1)*2+1]+b[(1*4+3)*2+1]);

-    int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
+  a[1*Nh].x = K*(b[(2*4+1)*2+0]+b[(2*4+3)*2+0]);
+  a[1*Nh].y = K*(b[(2*4+1)*2+1]+b[(2*4+3)*2+1]);
+  a[1*Nh].z = -K*(b[(0*4+0)*2+0]+b[(0*4+2)*2+0]);
+  a[1*Nh].w = -K*(b[(0*4+0)*2+1]+b[(0*4+2)*2+1]);

-    { // even sites
-      int k = 2*i + boundaryCrossings%2; 
-      Float *a = spinor + k*24;
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	  b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	  b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	  b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-	}
-      }
-      for (int j=0; j<6; j++) packFloat4(even+j*Nh+i, b+j*4);
-    }
-    
-    { // odd sites
-      int k = 2*i + (boundaryCrossings+1)%2;
-      Float *a = spinor + k*24;
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	  b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	  b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	  b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-	}
-      } 
-      for (int j=0; j<6; j++) packFloat4(odd+j*Nh+i, b+j*4);
-    }
-  }
+  a[2*Nh].x = -K*(b[(1*4+0)*2+0]+b[(1*4+2)*2+0]);
+  a[2*Nh].y = -K*(b[(1*4+0)*2+1]+b[(1*4+2)*2+1]);
+  a[2*Nh].z = -K*(b[(2*4+0)*2+0]+b[(2*4+2)*2+0]);
+  a[2*Nh].w = -K*(b[(2*4+0)*2+1]+b[(2*4+2)*2+1]);
+
+  a[3*Nh].x = K*(b[(0*4+1)*2+0]+b[(0*4+3)*2+0]);
+  a[3*Nh].y = K*(b[(0*4+1)*2+1]+b[(0*4+3)*2+1]);
+  a[3*Nh].z = K*(b[(1*4+1)*2+0]+b[(1*4+3)*2+0]);
+  a[3*Nh].w = K*(b[(1*4+1)*2+1]+b[(1*4+3)*2+1]);

+  a[4*Nh].x = K*(b[(2*4+1)*2+0]+b[(2*4+3)*2+0]);
+  a[4*Nh].y = K*(b[(2*4+1)*2+1]+b[(2*4+3)*2+1]);
+  a[4*Nh].z = K*(b[(0*4+2)*2+0]+b[(0*4+0)*2+0]);
+  a[4*Nh].w = K*(b[(0*4+2)*2+1]+b[(0*4+0)*2+1]);
+
+  a[5*Nh].x = K*(b[(1*4+2)*2+0]+b[(1*4+0)*2+0]);
+  a[5*Nh].y = K*(b[(1*4+2)*2+1]+b[(1*4+0)*2+1]);
+  a[5*Nh].z = K*(b[(2*4+2)*2+0]+b[(2*4+0)*2+0]);
+  a[5*Nh].w = K*(b[(2*4+2)*2+1]+b[(2*4+0)*2+1]);
 }

-// Standard spinor packing, colour inside spin
-void packParitySpinorDD(double2 *res, double *spinor) {
-  double K = 1.0 / (2.0);
-  double b[24];
+template <typename Float>
+inline void packSpinorVector(double2* a, Float *b) {
+  Float K = 1.0 / 2.0;

-  for (int i = 0; i < Nh; i++) {
-    double *a = spinor+i*24;
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-      }
-    }
+  for (int c=0; c<3; c++) {
+    a[c*Nh].x = K*(b[1*6+c*2+0]+b[3*6+c*2+0]);
+    a[c*Nh].y = K*(b[1*6+c*2+1]+b[3*6+c*2+1]);
+
+    a[(3+c)*Nh].x = -K*(b[0*6+c*2+0]+b[2*6+c*2+0]);
+    a[(3+c)*Nh].y = -K*(b[0*6+c*2+1]+b[2*6+c*2+1]);
+
+    a[(6+c)*Nh].x = K*(b[1*6+c*2+0]-b[3*6+c*2+0]);
+    a[(6+c)*Nh].y = K*(b[1*6+c*2+1]-b[3*6+c*2+1]);

-    for (int j=0; j<12; j++) packDouble2(res+j*Nh+i, b+j*2);
+    a[(9+c)*Nh].x = K*(b[2*6+c*2+0]-b[0*6+c*2+0]);
+    a[(9+c)*Nh].y = K*(b[2*6+c*2+1]-b[0*6+c*2+1]);
  }
+
 }

-void packParitySpinorSD(float4 *res, double *spinor) {
-  double K = 1.0 / (2.0);
-  float b[24];
+template <typename Float>
+inline void packQDPSpinorVector(double2* a, Float *b) {
+  Float K = 1.0 / 2.0;

-  for (int i = 0; i < Nh; i++) {
-    double *a = spinor+i*24;
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[0*6+cr] = K*(a[1*6+cr]+a[3*6+cr]);
-	b[1*6+cr] = -K*(a[0*6+cr]+a[2*6+cr]);
-	b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
-	b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
-      }
-    }
+  for (int c=0; c<3; c++) {
+    a[c*Nh].x = K*(b[(c*4+1)*2+0]+b[(c*4+3)*2+0]);
+    a[c*Nh].y = K*(b[(c*4+1)*2+1]+b[(c*4+3)*2+1]);
+
+    a[(3+c)*Nh].x = -K*(b[(c*4+0)*2+0]+b[(c*4+2)*2+0]);
+    a[(3+c)*Nh].y = -K*(b[(c*4+0)*2+1]+b[(c*4+2)*2+1]);

-    for (int j=0; j<6; j++) packFloat4(res+j*Nh+i, b+j*4);
+    a[(6+c)*Nh].x = K*(b[(c*4+1)*2+0]-b[(c*4+3)*2+0]);
+    a[(6+c)*Nh].y = K*(b[(c*4+1)*2+1]-b[(c*4+3)*2+1]);
+
+    a[(9+c)*Nh].x = K*(b[(c*4+2)*2+0]-b[(c*4+0)*2+0]);
+    a[(9+c)*Nh].y = K*(b[(c*4+2)*2+1]-b[(c*4+0)*2+1]);
  }
-}

-// single precision version of the above
-void packParitySpinorSS(float4 *res, float *spinor) {
-  float K = 1.0 / (2.0);
-  float b[24];
+}

-  for (int i = 0; i < Nh; i++) {
-    float *a = spinor+i*24;
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[cr] = K*(a[6+cr]+a[18+cr]);
-	b[6+cr] = -K*(a[cr]+a[12+cr]);
-	b[12+cr] = K*(a[6+cr]-a[18+cr]);
-	b[18+cr] = K*(a[12+cr]-a[cr]);
-      }
-    }
+template <typename Float>
+inline void unpackSpinorVector(Float *a, float4 *b) {
+  Float K = 1.0;

-    for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
-  }
+  a[0*6+0*2+0] = -K*(b[Nh].z+b[4*Nh].z);
+  a[0*6+0*2+1] = -K*(b[Nh].w+b[4*Nh].w);
+  a[0*6+1*2+0] = -K*(b[2*Nh].x+b[5*Nh].x);
+  a[0*6+1*2+1] = -K*(b[2*Nh].y+b[5*Nh].y);
+  a[0*6+2*2+0] = -K*(b[2*Nh].z+b[5*Nh].z);
+  a[0*6+2*2+1] = -K*(b[2*Nh].w+b[5*Nh].w);
+  
+  a[1*6+0*2+0] = K*(b[0].x+b[3*Nh].x);
+  a[1*6+0*2+1] = K*(b[0].y+b[3*Nh].y);
+  a[1*6+1*2+0] = K*(b[0].z+b[3*Nh].z);
+  a[1*6+1*2+1] = K*(b[0].w+b[3*Nh].w);  
+  a[1*6+2*2+0] = K*(b[Nh].x+b[4*Nh].x);
+  a[1*6+2*2+1] = K*(b[Nh].y+b[4*Nh].y);
+  
+  a[2*6+0*2+0] = -K*(b[Nh].z-b[4*Nh].z);
+  a[2*6+0*2+1] = -K*(b[Nh].w-b[4*Nh].w);
+  a[2*6+1*2+0] = -K*(b[2*Nh].x-b[5*Nh].x);
+  a[2*6+1*2+1] = -K*(b[2*Nh].y-b[5*Nh].y);
+  a[2*6+2*2+0] = -K*(b[2*Nh].z-b[5*Nh].z);
+  a[2*6+2*2+1] = -K*(b[2*Nh].w-b[5*Nh].w);
+  
+  a[3*6+0*2+0] = -K*(b[3*Nh].x-b[0].x);
+  a[3*6+0*2+1] = -K*(b[3*Nh].y-b[0].y);
+  a[3*6+1*2+0] = -K*(b[3*Nh].z-b[0].z);
+  a[3*6+1*2+1] = -K*(b[3*Nh].w-b[0].w);
+  a[3*6+2*2+0] = -K*(b[4*Nh].x-b[Nh].x);
+  a[3*6+2*2+1] = -K*(b[4*Nh].y-b[Nh].y);
 }

-// QDP spinor packing, spin inside colour
-void packQDPParitySpinorDD(double2 *res, double *spinor) {
-  double K = 1.0 / 2.0;
+template <typename Float>
+inline void unpackQDPSpinorVector(Float *a, float4 *b) {
+  Float K = 1.0;
+
+  a[(0*4+0)*2+0] = -K*(b[Nh].z+b[4*Nh].z);
+  a[(0*4+0)*2+1] = -K*(b[Nh].w+b[4*Nh].w);
+  a[(1*4+0)*2+0] = -K*(b[2*Nh].x+b[5*Nh].x);
+  a[(1*4+0)*2+1] = -K*(b[2*Nh].y+b[5*Nh].y);
+  a[(2*4+0)*2+0] = -K*(b[2*Nh].z+b[5*Nh].z);
+  a[(2*4+0)*2+1] = -K*(b[2*Nh].w+b[5*Nh].w);
  
-  double b[24];
-  for (int i = 0; i < Nh; i++) {
-    double *a = spinor+i*24;
-
-    // reorder and change basis
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
-	b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
-	b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
-	b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
-      }
-    }
+  a[(0*4+1)*2+0] = K*(b[0].x+b[3*Nh].x);
+  a[(0*4+1)*2+1] = K*(b[0].y+b[3*Nh].y);
+  a[(1*4+1)*2+0] = K*(b[0].z+b[3*Nh].z);
+  a[(1*4+1)*2+1] = K*(b[0].w+b[3*Nh].w);  
+  a[(2*4+1)*2+0] = K*(b[Nh].x+b[4*Nh].x);
+  a[(2*4+1)*2+1] = K*(b[Nh].y+b[4*Nh].y);
+  
+  a[(0*4+2)*2+0] = -K*(b[Nh].z-b[4*Nh].z);
+  a[(0*4+2)*2+1] = -K*(b[Nh].w-b[4*Nh].w);
+  a[(1*4+2)*2+0] = -K*(b[2*Nh].x-b[5*Nh].x);
+  a[(1*4+2)*2+1] = -K*(b[2*Nh].y-b[5*Nh].y);
+  a[(2*4+2)*2+0] = -K*(b[2*Nh].z-b[5*Nh].z);
+  a[(2*4+2)*2+1] = -K*(b[2*Nh].w-b[5*Nh].w);
+  
+  a[(0*4+3)*2+0] = -K*(b[3*Nh].x-b[0].x);
+  a[(0*4+3)*2+1] = -K*(b[3*Nh].y-b[0].y);
+  a[(1*4+3)*2+0] = -K*(b[3*Nh].z-b[0].z);
+  a[(1*4+3)*2+1] = -K*(b[3*Nh].w-b[0].w);
+  a[(2*4+3)*2+0] = -K*(b[4*Nh].x-b[Nh].x);
+  a[(2*4+3)*2+1] = -K*(b[4*Nh].y-b[Nh].y);
+}
+
+template <typename Float>
+inline void unpackSpinorVector(Float *a, double2 *b) {
+  Float K = 1.0;

-    for (int j = 0; j < 6; j++) packDouble2(res+j*Nh+i, b+j*2);
+  for (int c=0; c<3; c++) {
+    a[0*6+c*2+0] = -K*(b[(3+c)*Nh].x+b[(9+c)*Nh].x);
+    a[0*6+c*2+1] = -K*(b[(3+c)*Nh].y+b[(9+c)*Nh].y);
+
+    a[1*6+c*2+0] = K*(b[c*Nh].x+b[(6+c)*Nh].x);
+    a[1*6+c*2+1] = K*(b[c*Nh].y+b[(6+c)*Nh].y);
+
+    a[2*6+c*2+0] = -K*(b[(3+c)*Nh].x-b[(9+c)*Nh].x);
+    a[2*6+c*2+1] = -K*(b[(3+c)*Nh].y-b[(9+c)*Nh].y);
+    
+    a[3*6+c*2+0] = -K*(b[(6+c)*Nh].x-b[c*Nh].x);
+    a[3*6+c*2+1] = -K*(b[(6+c)*Nh].y-b[c*Nh].y);
  }
+
 }

-// QDP spinor packing, spin inside colour
-void packQDPParitySpinorSD(float4 *res, double *spinor) {
-  double K = 1.0 / 2.0;
-  
-  float b[24];
-  for (int i = 0; i < Nh; i++) {
-    double *a = spinor+i*24;
-
-    // reorder and change basis
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
-	b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
-	b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
-	b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
-      }
-    }
+template <typename Float>
+inline void unpackQDPSpinorVector(Float *a, double2 *b) {
+  Float K = 1.0;
+
+  for (int c=0; c<3; c++) {
+    a[(c*4+0)*2+0] = -K*(b[(3+c)*Nh].x+b[(9+c)*Nh].x);
+    a[(c*4+0)*2+1] = -K*(b[(3+c)*Nh].y+b[(9+c)*Nh].y);

-    for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
+    a[(c*4+1)*2+0] = K*(b[c*Nh].x+b[(6+c)*Nh].x);
+    a[(c*4+1)*2+1] = K*(b[c*Nh].y+b[(6+c)*Nh].y);
+
+    a[(c*4+2)*2+0] = -K*(b[(3+c)*Nh].x-b[(9+c)*Nh].x);
+    a[(c*4+2)*2+1] = -K*(b[(3+c)*Nh].y-b[(9+c)*Nh].y);
+    
+    a[(c*4+3)*2+0] = -K*(b[(6+c)*Nh].x-b[c*Nh].x);
+    a[(c*4+3)*2+1] = -K*(b[(6+c)*Nh].y-b[c*Nh].y);
  }
+
 }

-// Single precision version of the above
-void packQDPParitySpinorSS(float4 *res, float *spinor) {
-  float K = 1.0 / 2.0;
-  
-  float b[24];
+// Standard spinor packing, colour inside spin
+template <typename Float, typename FloatN>
+void packParitySpinor(FloatN *res, Float *spinor) {
  for (int i = 0; i < Nh; i++) {
-    float *a = spinor+i*24;
-
-    // reorder and change basis
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	b[0*6+cr] = K*(a[(c*4+1)*2+r]+a[(c*4+3)*2+r]);
-	b[1*6+cr] = -K*(a[(c*4+0)*2+r]+a[(c*4+2)*2+r]);
-	b[2*6+cr] = K*(a[(c*4+1)*2+r]-a[(c*4+3)*2+r]);
-	b[3*6+cr] = K*(a[(c*4+2)*2+r]-a[(c*4+0)*2+r]);
-      }
-    }
-
-    for (int j = 0; j < 6; j++) packFloat4(res+j*Nh+i, b+j*4);
+    packSpinorVector(res+i, spinor+24*i);
  }
 }

-void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
-  double K = 1.0;
-  double b[24];
+template <typename Float, typename FloatN>
+void packFullSpinor(FloatN *even, FloatN *odd, Float *spinor) {

  for (int i=0; i<Nh; i++) {

@@ -353,47 +360,19 @@ void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {

    { // even sites
      int k = 2*i + boundaryCrossings%2; 
-      double *a = res + k*24;
-
-      for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, even+j*Nh+i);
-
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	  a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	  a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	  a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-	}
-      }
-      
+      packSpinorVector(even+i, spinor+24*k);
    }
    
    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
-      double *a = res + k*24;
-
-      for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, odd+j*Nh+i);
-
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	  a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	  a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	  a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-	}
-      }
-      
+      packSpinorVector(odd+i, spinor+24*k);
    }
  }

 }

-template <typename Float>
-void unpackFullSpinorFS(Float *res, float4 *even, float4 *odd) {
-  Float K = 1.0;
-  float b[24];
+template <typename Float, typename FloatN>
+void unpackFullSpinor(Float *res, FloatN *even, FloatN *odd) {

  for (int i=0; i<Nh; i++) {

@@ -401,179 +380,43 @@ void unpackFullSpinorFS(Float *res, float4 *even, float4 *odd) {

    { // even sites
      int k = 2*i + boundaryCrossings%2; 
-      Float *a = res + k*24;
-
-      for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, even+j*Nh+i);
-
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	  a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	  a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	  a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-	}
-      }
-      
+      unpackSpinorVector(res+24*k, even+i);
    }
    
    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
-      Float *a = res + k*24;
-
-      for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, odd+j*Nh+i);
-
-      for (int c=0; c<3; c++) {
-	for (int r=0; r<2; r++) {
-	  int cr = c*2+r;
-	  a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	  a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	  a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	  a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-	}
-      }
-      
+      unpackSpinorVector(res+24*k, odd+i);
    }
  }

 }

-void unpackParitySpinorDD(double *res, double2 *spinorPacked) {
-  double K = 1.0;///sqrt(2.0);
-  double b[24];
-
-  for (int i = 0; i < Nh; i++) {
-    double *a = res+i*24;
-
-    for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
-
-  }

-}
-
-void unpackParitySpinorDS(double *res, float4 *spinorPacked) {
-  float K = 1.0;///sqrt(2.0);
-  float b[24];
+template <typename Float, typename FloatN>
+void unpackParitySpinor(Float *res, FloatN *spinorPacked) {

  for (int i = 0; i < Nh; i++) {
-    double *a = res+i*24;
-
-    for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
-
+    unpackSpinorVector(res+i*24, spinorPacked+i);
  }

 }

-void unpackParitySpinorSS(float *res, float4 *spinorPacked) {
-  float K = 1.0;///sqrt(2.0);
-  float b[24];
-
-  for (int i = 0; i < Nh; i++) {
-    float *a = res+i*24;
-
-    for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
-
-  }
-
-}
-
-void unpackQDPParitySpinorDD(double *res, double2 *spinorPacked) {
-  double K = 1.0;///sqrt(2.0);
-  double b[24];
-
-  for (int i = 0; i < Nh; i++) {
-    double *a = res+i*24;
-
-    for (int j = 0; j < 12; j++) unpackDouble2(b+j*2, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
-  }
-
-}
-
-void unpackQDPParitySpinorDS(double *res, float4 *spinorPacked) {
-  float K = 1.0;///sqrt(2.0);
-  float b[24];
-
+// QDP spinor packing, spin inside colour
+template <typename Float, typename FloatN>
+void packQDPParitySpinor(FloatN *res, Float *spinor) {
  for (int i = 0; i < Nh; i++) {
-    double *a = res+i*24;
-
-    for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
+    packQDPSpinorVector(res+i, spinor+i*24);
  }
-
 }

-void unpackQDPParitySpinorSS(float *res, float4 *spinorPacked) {
-  float K = 1.0;///sqrt(2.0);
-  float b[24];
-
+// QDP spinor packing, spin inside colour
+template <typename Float, typename FloatN>
+void unpackQDPParitySpinor(Float *res, FloatN *spinor) {
  for (int i = 0; i < Nh; i++) {
-    float *a = res+i*24;
-
-    for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, spinorPacked+j*Nh+i);
-
-    for (int c=0; c<3; c++) {
-      for (int r=0; r<2; r++) {
-	int cr = c*2+r;
-	a[(c*4+0)*2+r] = -K*(b[1*6+cr]+b[3*6+cr]);
-	a[(c*4+1)*2+r] = K*(b[0*6+cr]+b[2*6+cr]);
-	a[(c*4+2)*2+r] = -K*(b[1*6+cr]-b[3*6+cr]);
-	a[(c*4+3)*2+r] = -K*(b[2*6+cr]-b[0*6+cr]);
-      }
-    }
-
+    unpackQDPSpinorVector(res+i*24, spinor+i);
  }
 }

-
 void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec, 
 		      DiracFieldOrder dirac_order) {

@@ -595,17 +438,17 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
    
    if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
      if (ret.precision == QUDA_DOUBLE_PRECISION) {
-	packParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
+	packParitySpinor((double2*)packedSpinor1, (double*)spinor);
      } else {
-	if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
-	else packParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
+	if (cpu_prec == QUDA_DOUBLE_PRECISION) packParitySpinor((float4*)packedSpinor1, (double*)spinor);
+	else packParitySpinor((float4*)packedSpinor1, (float*)spinor);
      }
    } else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
      if (ret.precision == QUDA_DOUBLE_PRECISION) {
-	packQDPParitySpinorDD((double2*)packedSpinor1, (double*)spinor);
+	packQDPParitySpinor((double2*)packedSpinor1, (double*)spinor);
      } else {
-	if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinorSD((float4*)packedSpinor1, (double*)spinor);
-	else packQDPParitySpinorSS((float4*)packedSpinor1, (float*)spinor);
+	if (cpu_prec == QUDA_DOUBLE_PRECISION) packQDPParitySpinor((float4*)packedSpinor1, (double*)spinor);
+	else packQDPParitySpinor((float4*)packedSpinor1, (float*)spinor);
      }
    }
    cudaMemcpy(ret.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
@@ -634,11 +477,12 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
 #endif
    
    if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
-      packFullSpinorDD((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
+      packFullSpinor((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
    } else {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) 
-	packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
-      else packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
+	packFullSpinor((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
+      else 
+	packFullSpinor((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
    }
    
    cudaMemcpy(ret.even.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
@@ -692,17 +536,17 @@ void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Di
    cudaMemcpy(packedSpinor1, spinor.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
    if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
      if (spinor.precision == QUDA_DOUBLE_PRECISION) {
-	unpackParitySpinorDD((double*)res, (double2*)packedSpinor1);
+	unpackParitySpinor((double*)res, (double2*)packedSpinor1);
      } else {
-	if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinorDS((double*)res, (float4*)packedSpinor1);
-	else unpackParitySpinorSS((float*)res, (float4*)packedSpinor1);
+	if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackParitySpinor((double*)res, (float4*)packedSpinor1);
+	else unpackParitySpinor((float*)res, (float4*)packedSpinor1);
      }
    } else if (dirac_order == QUDA_QDP_DIRAC_ORDER) {
      if (spinor.precision == QUDA_DOUBLE_PRECISION) {
-	unpackQDPParitySpinorDD((double*)res, (double2*)packedSpinor1);
+	unpackQDPParitySpinor((double*)res, (double2*)packedSpinor1);
      } else {
-	if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinorDS((double*)res, (float4*)packedSpinor1);
-	else unpackQDPParitySpinorSS((float*)res, (float4*)packedSpinor1);
+	if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackQDPParitySpinor((double*)res, (float4*)packedSpinor1);
+	else unpackQDPParitySpinor((float*)res, (float4*)packedSpinor1);
      }
    }
  } else {
@@ -726,11 +570,11 @@ void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
    cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor_bytes, cudaMemcpyDeviceToHost);
    if (spinor.even.precision == QUDA_DOUBLE_PRECISION) {
-      unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
+      unpackFullSpinor((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
    } else {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) 
-	unpackFullSpinorFS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
-      else unpackFullSpinorFS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
+	unpackFullSpinor((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
+      else unpackFullSpinor((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
    }
    
 #ifndef __DEVICE_EMULATION__