more work on quda clover

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@462 be54200a-260c-0410-bdd7-ce6af2a381ab

more work on quda clover
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@462 be54200a-260c-0410-bdd7-ce6af2a381ab
458d290a · rbabich · 7f8a13b9 · 458d290a · 458d290a · 458d290a
Commit 458d290a authored Sep 20, 2009 by rbabich
--- a/Makefile.tmpl
+++ b/Makefile.tmpl
@@ -6,7 +6,7 @@ CPU_ARCH = x86_64   # x86 or x86_64
 GPU_ARCH = sm_13    # sm_10, sm_11, sm_12, or sm_13
 GPU_EMU  = false    # set to 'true' for device emulation

-PYTHON = python2.6  # python 2.5 or later required for 'make generated'
+PYTHON = python2.6  # python 2.5 or later required for 'make gen'

 ######

@@ -36,44 +36,45 @@ LDFLAGS = -fPIC $(LIB)

 all: dslash_test invert_test su3_test pack_test

-ILIB = libquda.a
-ILIB_OBJS = blas_quda.o blas_reference.o dslash_quda.o dslash_reference.o   \
-	gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o        \
-	spinor_quda.o util_quda.o
-ILIB_HDRS = blas_quda.h blas_reference.h dslash_def.h dslash_quda.h         \
-	dslash_reference.h enum_quda.h gauge_quda.h invert_quda.h           \
-	io_spinor.h quda.h read_clover.h read_gauge.h reduce_complex_core.h \
-	reduce_core.h reduce_triple_core.h spinor_quda.h util_quda.h
-ILIB_CORE = dslash_core.h dslash_dagger_core.h
+QUDA = libquda.a
+QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o      \
+	dslash_reference.o gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o \
+	invert_quda.o spinor_quda.o util_quda.o
+QUDA_HDRS = blas_quda.h blas_reference.h clover_def.h dslash_def.h        \
+	dslash_quda.h dslash_reference.h enum_quda.h gauge_quda.h         \
+	invert_quda.h io_spinor.h quda.h read_clover.h read_gauge.h       \
+	reduce_complex_core.h reduce_core.h reduce_triple_core.h          \
+	spinor_quda.h util_quda.h
+QUDA_CORE = clover_core.h dslash_core.h dslash_dagger_core.h

-$(ILIB): $(ILIB_OBJS)
-	ar cru $@ $(ILIB_OBJS)
+$(QUDA): $(QUDA_OBJS)
+	ar cru $@ $(QUDA_OBJS)

-dslash_test: dslash_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+dslash_test: dslash_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-invert_test: invert_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+invert_test: invert_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-su3_test: su3_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+su3_test: su3_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-pack_test: pack_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+pack_test: pack_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-generated:
+gen:
 	$(PYTHON) dslash_cuda_gen.py

 clean:
-	-rm -f *.o dslash_test invert_test su3_test pack_test $(ILIB)
+	-rm -f *.o dslash_test invert_test su3_test pack_test $(QUDA)

-%.o: %.c $(ILIB_HDRS)
+%.o: %.c $(QUDA_HDRS)
 	$(CC) $(CFLAGS) $< -c -o $@

-%.o: %.cpp $(ILIB_HDRS)
+%.o: %.cpp $(QUDA_HDRS)
 	$(CXX) $(CXXFLAGS) $< -c -o $@

-%.o: %.cu $(ILIB_HDRS) $(ILIB_CORE)
+%.o: %.cu $(QUDA_HDRS) $(QUDA_CORE)
 	$(NVCC) $(NVCCFLAGS) $< -c -o $@

-.PHONY: all generated clean
+.PHONY: all gen clean
--- a/README
+++ b/README
-QUDA v0.x Release Notes
-----------------------
+
+Release Notes for QUDA v0.x
+---------------------------

 Overview:

@@ -44,7 +45,8 @@ Installation:
 In the source directory, copy the template 'Makefile.tmpl' to
 'Makefile', and edit the first few lines to specify the CUDA install
 path, the platform (x86 or x86_64), and the GPU architecture (see
-"Compatibility" above).  Then type 'make' to build the library.
+"Hardware compatibility" above).  Then type 'make' to build the
+library.


 Using the library:
@@ -74,9 +76,11 @@ For help or to report a bug, please contact Mike Clark
 (mikec@seas.harvard.edu) or Ron Babich (rbabich@bu.edu).

 If you find this code useful in your work, a citation to the following
-write-up would be appreciated:
+would be appreciated:

 K. Barros et al., "Blasting through lattice calculations using CUDA,"
 PoS LATTICE2008, 045 (2008) [arXiv:0810.5365 [hep-lat]].

-Please also let us know so that we can send you updates and bug-fixes.
+Please also drop us a note so that we can send you updates and
+bug-fixes.
+
--- a/clover_core.h
+++ b/clover_core.h
--- a/clover_def.h
+++ b/clover_def.h
+// clover_def.h - clover kernel definitions
+
+// initialize on first iteration
+
+#ifndef DD_LOOP
+#define DD_LOOP
+#define DD_XPAY 0
+#define DD_SPREC 0
+#define DD_CPREC 0
+#endif
+
+// set options for current iteration
+
+#if (DD_XPAY==0) // no xpay 
+#define DD_XPAY_F 
+#define DD_PARAM2 int oddBit
+#else            // xpay
+#define DD_XPAY_F Xpay
+#if (DD_SPREC == 0)
+#define DD_PARAM2 int oddBit, double a
+#else
+#define DD_PARAM2 int oddBit, float a
+#endif
+#define DSLASH_XPAY
+#endif
+
+#if (DD_SPREC==0) // double-precision spinor field
+#define DD_SPREC_F D
+#define DD_PARAM1 double2* g_out
+#define READ_SPINOR READ_SPINOR_DOUBLE
+#define SPINORTEX spinorTexDouble
+#define WRITE_SPINOR WRITE_SPINOR_DOUBLE2
+#define SPINOR_DOUBLE
+#if (DD_XPAY==1)
+#define ACCUMTEX accumTexDouble
+#define READ_ACCUM READ_ACCUM_DOUBLE
+#endif
+#elif (DD_SPREC==1) // single-precision spinor field
+#define DD_SPREC_F S
+#define DD_PARAM1 float4* g_out
+#define READ_SPINOR READ_SPINOR_SINGLE
+#define SPINORTEX spinorTexSingle
+#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
+#if (DD_XPAY==1)
+#define ACCUMTEX accumTexSingle
+#define READ_ACCUM READ_ACCUM_SINGLE
+#endif
+#else            // half-precision spinor field
+#define DD_SPREC_F H
+#define READ_SPINOR READ_SPINOR_HALF
+#define SPINORTEX spinorTexHalf
+#define DD_PARAM1 short4* g_out, float *c
+#define WRITE_SPINOR WRITE_SPINOR_SHORT4
+#if (DD_XPAY==1)
+#define ACCUMTEX accumTexHalf
+#define READ_ACCUM READ_ACCUM_HALF
+#endif
+#endif
+
+#if (DD_CPREC==0) // double-precision clover term
+#define DD_CPREC_F D
+#define CLOVERTEX cloverTexDouble
+#define READ_CLOVER READ_CLOVER_DOUBLE
+#define CLOVER_DOUBLE
+#elif (DD_CPREC==1) // single-precision clover term
+#define DD_CPREC_F S
+#define CLOVERTEX cloverTexSingle
+#define READ_CLOVER READ_CLOVER_SINGLE
+#else               // half-precision clover term
+#define DD_CPREC_F H
+#define CLOVERTEX cloverTexHalf
+#define READ_CLOVER READ_CLOVER_HALF
+#endif
+
+#define DD_CONCAT(s,c,x) clover ## s ## c ## x ## Kernel
+#define DD_FUNC(s,c,x) DD_CONCAT(s,c,x)
+
+// define the kernel: DD_FUNC pastes its arguments into clover<S><C><X>Kernel, with S/C the spinor/clover precision letters (D, S or H) and X either empty or Xpay; the body comes from clover_core.h
+
+#if !(__CUDA_ARCH__ != 130 && (DD_SPREC == 0 || DD_CPREC == 0)) // omit any variant using double precision (value 0) unless __CUDA_ARCH__ is 130
+
+__global__ void
+DD_FUNC(DD_SPREC_F, DD_CPREC_F, DD_XPAY_F)(DD_PARAM1, DD_PARAM2) {
+#include "clover_core.h"
+}
+
+#endif
+
+// clean up
+
+#undef DD_SPREC_F
+#undef DD_CPREC_F
+#undef DD_XPAY_F
+#undef DD_PARAM1
+#undef DD_PARAM2
+#undef DD_CONCAT
+#undef DD_FUNC
+
+#undef DSLASH_XPAY
+#undef READ_SPINOR
+#undef SPINORTEX
+#undef WRITE_SPINOR
+#undef ACCUMTEX
+#undef READ_ACCUM
+#undef CLOVERTEX
+#undef READ_CLOVER
+#undef GAUGE_DOUBLE
+#undef SPINOR_DOUBLE
+#undef CLOVER_DOUBLE
+
+// prepare next set of options, or clean up after final iteration (DD_SPREC is the fast counter 0->1->2, DD_CPREC the slow one; nine precision combinations are visited in total)
+
+//#if (DD_XPAY==0)   // xpay variant is not needed
+//#undef DD_XPAY
+//#define DD_XPAY 1
+//#else
+//#undef DD_XPAY
+//#define DD_XPAY 0
+
+#if (DD_SPREC==0) // advance spinor precision: double -> single
+#undef DD_SPREC
+#define DD_SPREC 1
+#elif (DD_SPREC==1) // single -> half
+#undef DD_SPREC
+#define DD_SPREC 2
+#else // spinor precision wraps back to double and the clover precision is advanced instead
+#undef DD_SPREC
+#define DD_SPREC 0
+
+#if (DD_CPREC==0) // clover precision: double -> single
+#undef DD_CPREC
+#define DD_CPREC 1
+#elif (DD_CPREC==1) // single -> half
+#undef DD_CPREC
+#define DD_CPREC 2
+#else // every combination has been generated: drop all loop state so the self-inclusion below stops
+
+#undef DD_LOOP
+#undef DD_XPAY
+#undef DD_SPREC
+#undef DD_CPREC
+
+#endif // DD_CPREC
+#endif // DD_SPREC
+//#endif // DD_XPAY
+
+#ifdef DD_LOOP // still iterating: include this header again to emit the next kernel variant
+#include "clover_def.h"
+#endif
--- a/clover_quda.cpp
+++ b/clover_quda.cpp
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <quda.h>
+#include <spinor_quda.h>
+#include <util_quda.h>
+
+
+// Allocate device storage for the clover term on one parity of the lattice.
+//   X         - four extents, copied into ret.X; their product becomes ret.volume
+//   precision - storage precision of the device copy
+// Returns a ParityClover whose `clover` member points at ret.bytes of device
+// memory (72 reals per site).  In half precision a second buffer, `cloverNorm`,
+// of ret.bytes/18 bytes is allocated as well, i.e. two floats per site, which
+// is what packCloverMatrixHalf() writes.  For the other precisions cloverNorm
+// is set to NULL.
+// Any failed device allocation is reported and terminates the program with a
+// non-zero exit status.
+ParityClover allocateParityClover(int *X, Precision precision)
+{
+  ParityClover ret;
+
+  ret.precision = precision;
+  ret.volume = 1;
+  for (int d=0; d<4; d++) {
+    ret.X[d] = X[d];
+    ret.volume *= X[d];
+  }
+  ret.Nc = 3;
+  ret.Ns = 4;
+  ret.length = ret.volume*ret.Nc*ret.Nc*ret.Ns*ret.Ns/2; // block-diagonal Hermitian (72 reals)
+
+  if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
+  else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
+  else ret.bytes = ret.length*sizeof(short);
+
+  // never hand back indeterminate pointers, so that freeing is always safe
+  ret.clover = NULL;
+  ret.cloverNorm = NULL;
+
+  // treat every error code as fatal, not only cudaErrorMemoryAllocation
+  if (cudaMalloc((void**)&(ret.clover), ret.bytes) != cudaSuccess) {
+    printf("Error allocating clover term\n");
+    exit(-1);
+  }
+
+  if (precision == QUDA_HALF_PRECISION) {
+    if (cudaMalloc((void**)&ret.cloverNorm, ret.bytes/18) != cudaSuccess) {
+      printf("Error allocating cloverNorm\n");
+      exit(-1);
+    }
+  }
+
+  return ret;
+}
+
+// Allocate a full clover field: one ParityClover for the even sites followed by
+// one for the odd sites, both built from the same extents and precision.
+FullClover allocateCloverField(int *X, Precision precision)
+{
+  FullClover field;
+  ParityClover *halves[2] = { &field.even, &field.odd };
+
+  for (int p=0; p<2; p++) {
+    *halves[p] = allocateParityClover(X, precision);
+  }
+
+  return field;
+}
+
+// Release the device memory owned by one parity of the clover field.
+// Frees the clover buffer and, in half precision, the norm buffer that
+// allocateParityClover() creates alongside it (previously leaked).  Freed
+// pointers are reset to NULL, so calling this twice is harmless.
+void freeParityClover(ParityClover *clover)
+{
+  cudaFree(clover->clover);
+  clover->clover = NULL;
+
+  // cloverNorm is only allocated for half precision; leave it alone otherwise
+  if (clover->precision == QUDA_HALF_PRECISION) {
+    cudaFree(clover->cloverNorm);
+    clover->cloverNorm = NULL;
+  }
+}
+
+// Release both parities of a full clover field, even first and then odd.
+void freeCloverField(FullClover *clover)
+{
+  ParityClover *halves[2] = { &clover->even, &clover->odd };
+
+  for (int p=0; p<2; p++) {
+    freeParityClover(halves[p]);
+  }
+}
+
+// Scatter one site's clover matrix (72 consecutive reals at b) into float4
+// storage: the i-th group of four reals is written to a[i*Vh], i = 0..17.
+template <typename Float>
+static inline void packCloverMatrix(float4* a, Float *b, int Vh)
+{
+  for (int chunk=0; chunk<18; chunk++) {
+    float4 &dst = a[chunk*Vh];
+    const Float *src = b + 4*chunk;
+
+    dst.x = src[0];
+    dst.y = src[1];
+    dst.z = src[2];
+    dst.w = src[3];
+  }
+}
+
+// Scatter one site's clover matrix (72 consecutive reals at b) into double2
+// storage: the i-th pair of reals is written to a[i*Vh], i = 0..35.
+template <typename Float>
+static inline void packCloverMatrix(double2* a, Float *b, int Vh)
+{
+  for (int pair=0; pair<36; pair++) {
+    double2 &dst = a[pair*Vh];
+    const Float *src = b + 2*pair;
+
+    dst.x = src[0];
+    dst.y = src[1];
+  }
+}
+
+// Pack a single-parity host clover field (72 reals per site, Vh sites) into the
+// strided device layout, one site at a time.
+template <typename Float, typename FloatN>
+static void packParityClover(FloatN *res, Float *clover, int Vh)
+{
+  int site = 0;
+  while (site < Vh) {
+    FloatN *dst = res + site;
+    Float *src = clover + 72*site;
+    packCloverMatrix(dst, src, Vh);
+    site++;
+  }
+}
+
+// Split a host clover field covering both parities into packed even and odd
+// halves.
+//   even, odd - destination buffers in the strided layout of packCloverMatrix()
+//   clover    - host field, 72 reals per site, indexed by full-lattice site
+//   X         - per-parity extents (X[0] is half the full extent); not modified
+// For checkerboard index i the two full-lattice sites 2*i and 2*i+1 are assigned
+// to even/odd according to the parity of the row, plane and volume indices
+// derived from i.  Those must be computed with the per-parity X[0]: the previous
+// code doubled X[0] in place first, which both used the wrong divisor and
+// altered the caller's array.
+template <typename Float, typename FloatN>
+static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
+{
+  const int X0h = X[0]; // checkerboarded extent of the fastest-running dimension
+  const int Vh = X0h*X[1]*X[2]*X[3];
+
+  for (int i=0; i<Vh; i++) {
+
+    int boundaryCrossings = i/X0h + i/(X[1]*X0h) + i/(X[2]*X[1]*X0h);
+
+    { // even sites
+      int k = 2*i + boundaryCrossings%2;
+      packCloverMatrix(even+i, clover+72*k, Vh);
+    }
+
+    { // odd sites
+      int k = 2*i + (boundaryCrossings+1)%2;
+      packCloverMatrix(odd+i, clover+72*k, Vh);
+    }
+  }
+}
+
+// Quantize one site's clover matrix to 16-bit fixed point.
+//   res    - destination; block chi occupies res[(9*chi+i)*Vh], i = 0..8
+//   norm   - receives one scale factor per chiral block at norm[chi*Vh]
+//   clover - 72 reals: two chiral blocks of 36 reals each
+//   Vh     - stride between consecutive short4/norm entries of the same site
+// Each block is scaled by c = MAX_SHORT/max|element| before truncation to short,
+// and 1/c is stored so the values can be rescaled later.
+// Fixes: the second block is now placed 9*Vh entries after the first (the
+// stride was previously ignored, so it landed on top of other sites' data), and
+// an all-zero block yields zeros with norm 0 instead of inf/NaN arithmetic.
+template<typename Float>
+static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
+{
+  Float max, a, c;
+
+  // treat the two chiral blocks separately
+  for (int chi=0; chi<2; chi++) {
+    Float *block = clover + 36*chi;
+    short4 *out = res + 9*chi*Vh; // same stride as the float4/double2 layouts
+
+    max = fabs(block[0]);
+    for (int i=1; i<36; i++) {
+      if ((a = fabs(block[i])) > max) max = a;
+    }
+
+    // guard the degenerate block: MAX_SHORT/0 would be inf and inf*0 is NaN
+    c = (max > 0) ? MAX_SHORT/max : 0;
+    for (int i=0; i<9; i++) {
+      out[i*Vh].x = (short) (c * block[4*i+0]);
+      out[i*Vh].y = (short) (c * block[4*i+1]);
+      out[i*Vh].z = (short) (c * block[4*i+2]);
+      out[i*Vh].w = (short) (c * block[4*i+3]);
+    }
+    norm[chi*Vh] = (max > 0) ? 1/c : 0;
+  }
+}
+
+// Half-precision counterpart of packParityClover(): quantize every site of a
+// single-parity host field, writing the 16-bit data to res and the per-block
+// scale factors to norm.
+template <typename Float>
+static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh)
+{
+  int site = 0;
+  while (site < Vh) {
+    short4 *dst = res + site;
+    float *dstNorm = norm + site;
+    Float *src = clover + 72*site;
+    packCloverMatrixHalf(dst, dstNorm, src, Vh);
+    site++;
+  }
+}
+
+// Half-precision counterpart of packFullClover(): split a host clover field
+// covering both parities into quantized even and odd halves.
+//   even, evenNorm - 16-bit data and scale factors for the even sites
+//   odd, oddNorm   - likewise for the odd sites
+//   clover         - host field, 72 reals per site, indexed by full-lattice site
+//   X              - per-parity extents (X[0] is half the full extent); not
+//                    modified
+// The parity bookkeeping uses the per-parity X[0]; the previous code doubled
+// X[0] in place first, which used the wrong divisor and altered the caller's
+// array.
+template <typename Float>
+static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm,
+                               Float *clover, int *X)
+{
+  const int X0h = X[0]; // checkerboarded extent of the fastest-running dimension
+  const int Vh = X0h*X[1]*X[2]*X[3];
+
+  for (int i=0; i<Vh; i++) {
+
+    int boundaryCrossings = i/X0h + i/(X[1]*X0h) + i/(X[2]*X[1]*X0h);
+
+    { // even sites
+      int k = 2*i + boundaryCrossings%2;
+      packCloverMatrixHalf(even+i, evenNorm+i, clover+72*k, Vh);
+    }
+
+    { // odd sites
+      int k = 2*i + (boundaryCrossings+1)%2;
+      packCloverMatrixHalf(odd+i, oddNorm+i, clover+72*k, Vh);
+    }
+  }
+}
+
+// Pack a single-parity clover field held in host memory and copy it to the
+// device.
+//   ret          - destination; ret.clover (and ret.cloverNorm in half
+//                  precision) must already be allocated on the device
+//   clover       - host array of ret.volume sites, 72 reals each
+//   cpu_prec     - precision of the host array; anything other than
+//                  QUDA_DOUBLE_PRECISION is read as float
+//   clover_order - must be QUDA_PACKED_CLOVER_ORDER
+// Exits with status -1 if the device copy is double precision but the host
+// data is not, or if the ordering is unsupported.
+void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
+                      CloverFieldOrder clover_order)
+{
+  // use pinned memory
+  void *packedClover, *packedCloverNorm = NULL;
+
+  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
+    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
+    exit(-1);
+  }
+  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
+    printf("QUDA error: invalid clover order\n");
+    exit(-1);
+  }
+
+  // host staging buffers; plain malloc stands in for pinned memory under emulation
+#ifndef __DEVICE_EMULATION__
+  cudaMallocHost(&packedClover, ret.bytes);
+  if (ret.precision == QUDA_HALF_PRECISION) cudaMallocHost(&packedCloverNorm, ret.bytes/18);
+#else
+  packedClover = malloc(ret.bytes);
+  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
+#endif
+
+  if (ret.precision == QUDA_DOUBLE_PRECISION) {
+    packParityClover((double2 *)packedClover, (double *)clover, ret.volume);
+  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
+    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
+      packParityClover((float4 *)packedClover, (double *)clover, ret.volume);
+    } else {
+      packParityClover((float4 *)packedClover, (float *)clover, ret.volume);
+    }
+  } else {
+    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
+      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (double *)clover, ret.volume);
+    } else {
+      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (float *)clover, ret.volume);
+    }
+  }
+
+  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
+  if (ret.precision == QUDA_HALF_PRECISION) {
+    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
+  }
+
+#ifndef __DEVICE_EMULATION__
+  cudaFreeHost(packedClover);
+  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
+#else
+  free(packedClover);
+  // was CUDA_HALF_PRECISION, a name used nowhere else in this file
+  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
+#endif
+
+}
+
+// Pack a host clover field that covers both parities and copy the resulting
+// even and odd halves to the device.
+//   ret          - destination; both parities must already be allocated.  The
+//                  precision and byte count of ret.even are used for both.
+//   clover       - host array, 72 reals per site
+//   cpu_prec     - precision of the host array; anything other than
+//                  QUDA_DOUBLE_PRECISION is read as float
+//   clover_order - must be QUDA_LEX_PACKED_CLOVER_ORDER
+// Exits with status -1 if the device copy is double precision but the host
+// data is not, or if the ordering is unsupported.
+void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
+                    CloverFieldOrder clover_order)
+{
+  const bool hostDouble = (cpu_prec == QUDA_DOUBLE_PRECISION);
+  const bool devDouble = (ret.even.precision == QUDA_DOUBLE_PRECISION);
+  const bool devSingle = (ret.even.precision == QUDA_SINGLE_PRECISION);
+  const bool devHalf = (ret.even.precision == QUDA_HALF_PRECISION);
+
+  if (devDouble && !hostDouble) {
+    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
+    exit(-1);
+  }
+  if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
+    printf("QUDA error: invalid clover order\n");
+    exit(-1);
+  }
+
+  // host staging buffers: pinned memory, or plain malloc under device emulation
+  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;
+  const size_t normBytes = ret.even.bytes/18;
+
+#ifndef __DEVICE_EMULATION__
+  cudaMallocHost(&packedEven, ret.even.bytes);
+  cudaMallocHost(&packedOdd, ret.even.bytes);
+  if (devHalf) {
+    cudaMallocHost(&packedEvenNorm, normBytes);
+    cudaMallocHost(&packedOddNorm, normBytes);
+  }
+#else
+  packedEven = malloc(ret.even.bytes);
+  packedOdd = malloc(ret.even.bytes);
+  if (devHalf) {
+    packedEvenNorm = malloc(normBytes);
+    packedOddNorm = malloc(normBytes);
+  }
+#endif
+
+  // pick the packing routine from the (device, host) precision pair
+  if (devDouble) {
+    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X);
+  } else if (devSingle) {
+    if (hostDouble) packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X);
+    else packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X);
+  } else if (hostDouble) {
+    packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
+                       (float *)packedOddNorm, (double *)clover, ret.even.X);
+  } else {
+    packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
+                       (float *)packedOddNorm, (float *)clover, ret.even.X);
+  }
+
+  // upload the packed halves (and their norms in half precision)
+  cudaMemcpy(ret.even.clover, packedEven, ret.even.bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(ret.odd.clover, packedOdd, ret.even.bytes, cudaMemcpyHostToDevice);
+  if (devHalf) {
+    cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, normBytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, normBytes, cudaMemcpyHostToDevice);
+  }
+
+#ifndef __DEVICE_EMULATION__
+  cudaFreeHost(packedEven);
+  cudaFreeHost(packedOdd);
+  if (devHalf) {
+    cudaFreeHost(packedEvenNorm);
+    cudaFreeHost(packedOddNorm);
+  }
+#else
+  free(packedEven);
+  free(packedOdd);
+  if (devHalf) {
+    free(packedEvenNorm);
+    free(packedOddNorm);
+  }
+#endif
+
+}
+
+// Upload a host clover field to the device, dispatching on its ordering:
+//   QUDA_LEX_PACKED_CLOVER_ORDER - one array covering both parities, handed to
+//                                  loadFullClover()
+//   QUDA_PACKED_CLOVER_ORDER     - even parity first, odd parity starting
+//                                  ret.even.length reals later; each half is
+//                                  handed to loadParityClover()
+// Any other ordering is reported and terminates the program with status -1.
+void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFieldOrder clover_order)
+{
+  if (clover_order == QUDA_LEX_PACKED_CLOVER_ORDER) {
+    loadFullClover(ret, clover, cpu_prec, clover_order);
+    return;
+  }
+
+  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
+    printf("QUDA error: CloverFieldOrder %d not supported\n", clover_order);
+    exit(-1);
+  }
+
+  // the odd half begins ret.even.length elements past the start of the host array
+  void *clover_odd = (cpu_prec == QUDA_SINGLE_PRECISION)
+    ? (void *)((float *)clover + ret.even.length)
+    : (void *)((double *)clover + ret.even.length);
+
+  loadParityClover(ret.even, clover, cpu_prec, clover_order);
+  loadParityClover(ret.odd, clover_odd, cpu_prec, clover_order);
+}
--- a/dslash_core.h
+++ b/dslash_core.h
@@ -1085,10 +1085,10 @@ o32_re = o32_im = 0;
    spinorFloat a30_re =  o00_re - o20_re;
    spinorFloat a30_im =  o00_im - o20_im;
    
-    o00_re = a00_re;
-    o10_re = a10_re;
-    o20_re = a20_re;
-    o30_re = a30_re;
+    o00_re = a00_re;  o00_im = a00_im;
+    o10_re = a10_re;  o10_im = a10_im;
+    o20_re = a20_re;  o20_im = a20_im;
+    o30_re = a30_re;  o30_im = a30_im;
 }
 {
    spinorFloat a01_re = -o11_re - o31_re;
@@ -1100,10 +1100,10 @@ o32_re = o32_im = 0;
    spinorFloat a31_re =  o01_re - o21_re;
    spinorFloat a31_im =  o01_im - o21_im;
    
-    o01_re = a01_re;
-    o11_re = a11_re;
-    o21_re = a21_re;
-    o31_re = a31_re;
+    o01_re = a01_re;  o01_im = a01_im;
+    o11_re = a11_re;  o11_im = a11_im;
+    o21_re = a21_re;  o21_im = a21_im;
+    o31_re = a31_re;  o31_im = a31_im;
 }
 {
    spinorFloat a02_re = -o12_re - o32_re;
@@ -1115,10 +1115,10 @@ o32_re = o32_im = 0;
    spinorFloat a32_re =  o02_re - o22_re;
    spinorFloat a32_im =  o02_im - o22_im;
    
-    o02_re = a02_re;
-    o12_re = a12_re;
-    o22_re = a22_re;
-    o32_re = a32_re;
+    o02_re = a02_re;  o02_im = a02_im;
+    o12_re = a12_re;  o12_im = a12_im;
+    o22_re = a22_re;  o22_im = a22_im;
+    o32_re = a32_re;  o32_im = a32_im;
 }

 // apply first chiral block
@@ -1329,10 +1329,10 @@ o32_re = o32_im = 0;
    spinorFloat a30_re = -o00_re + o20_re;
    spinorFloat a30_im = -o00_im + o20_im;
    
-    o00_re = a00_re;
-    o10_re = a10_re;
-    o20_re = a20_re;
-    o30_re = a30_re;
+    o00_re = a00_re;  o00_im = a00_im;
+    o10_re = a10_re;  o10_im = a10_im;
+    o20_re = a20_re;  o20_im = a20_im;
+    o30_re = a30_re;  o30_im = a30_im;
 }
 {
    spinorFloat a01_re =  o11_re + o31_re;
@@ -1344,10 +1344,10 @@ o32_re = o32_im = 0;
    spinorFloat a31_re = -o01_re + o21_re;
    spinorFloat a31_im = -o01_im + o21_im;
    
-    o01_re = a01_re;
-    o11_re = a11_re;
-    o21_re = a21_re;
-    o31_re = a31_re;
+    o01_re = a01_re;  o01_im = a01_im;
+    o11_re = a11_re;  o11_im = a11_im;
+    o21_re = a21_re;  o21_im = a21_im;
+    o31_re = a31_re;  o31_im = a31_im;
 }
 {
    spinorFloat a02_re =  o12_re + o32_re;
@@ -1359,10 +1359,10 @@ o32_re = o32_im = 0;
    spinorFloat a32_re = -o02_re + o22_re;
    spinorFloat a32_im = -o02_im + o22_im;
    
-    o02_re = a02_re;
-    o12_re = a12_re;
-    o22_re = a22_re;
-    o32_re = a32_re;
+    o02_re = a02_re;  o02_im = a02_im;
+    o12_re = a12_re;  o12_im = a12_im;
+    o22_re = a22_re;  o22_im = a22_im;
+    o32_re = a32_re;  o32_im = a32_im;
 }
 #endif // DSLASH_CLOVER

@@ -1474,3 +1474,40 @@ o32_re = o32_im = 0;
 #undef i31_im
 #undef i32_re
 #undef i32_im
+
+#undef c00_00_re
+#undef c01_01_re
+#undef c02_02_re
+#undef c10_10_re
+#undef c11_11_re
+#undef c12_12_re
+#undef c01_00_re
+#undef c01_00_im
+#undef c02_00_re
+#undef c02_00_im
+#undef c10_00_re
+#undef c10_00_im
+#undef c11_00_re
+#undef c11_00_im
+#undef c12_00_re
+#undef c12_00_im
+#undef c02_01_re
+#undef c02_01_im
+#undef c10_01_re
+#undef c10_01_im
+#undef c11_01_re
+#undef c11_01_im
+#undef c12_01_re
+#undef c12_01_im
+#undef c10_02_re
+#undef c10_02_im
+#undef c11_02_re
+#undef c11_02_im
+#undef c12_02_re
+#undef c12_02_im
+#undef c11_10_re
+#undef c11_10_im
+#undef c12_10_re
+#undef c12_10_im
+#undef c12_11_re
+#undef c12_11_im
--- a/dslash_cuda_gen.py
+++ b/dslash_cuda_gen.py
@@ -467,7 +467,8 @@ def toChiralBasis(c):
    str.append("\n")

    for s in range (0,4):
-        str.append(out_re(s,c)+" = "+a_re(0,s,c)+";\n")
+        str.append(out_re(s,c)+" = "+a_re(0,s,c)+";  ")
+        str.append(out_im(s,c)+" = "+a_im(0,s,c)+";\n")

    return block(''.join(str))
 # end def toChiralBasis
@@ -486,7 +487,8 @@ def fromChiralBasis(c): # note: factor of 1/2 is included in clover term normali
    str.append("\n")

    for s in range (0,4):
-        str.append(out_re(s,c)+" = "+a_re(0,s,c)+";\n")
+        str.append(out_re(s,c)+" = "+a_re(0,s,c)+";  ")
+        str.append(out_im(s,c)+" = "+a_im(0,s,c)+";\n")

    return block(''.join(str))
 # end def fromChiralBasis
@@ -516,7 +518,8 @@ def cloverMult(chi):

    for s in range (0,2):
        for c in range (0,3):
-            str.append(out_re(2*chi+s,c)+" = "+a_re(chi,s,c)+"; "+out_im(2*chi+s,c)+" = "+a_im(chi,s,c)+";\n")
+            str.append(out_re(2*chi+s,c)+" = "+a_re(chi,s,c)+";  ")
+            str.append(out_im(2*chi+s,c)+" = "+a_im(chi,s,c)+";\n")
    str.append("\n")

    return block(''.join(str))+"\n"
@@ -595,6 +598,20 @@ def epilog():
            i = 3*s+c
            str.append("#undef "+in_re(s,c)+"\n")
            str.append("#undef "+in_im(s,c)+"\n")
+    str.append("\n")
+
+    for m in range(0,6):
+        s = m/3
+        c = m%3
+        str.append("#undef "+c_re(0,s,c,s,c)+"\n")
+    for n in range(0,6):
+        sn = n/3
+        cn = n%3
+        for m in range(n+1,6):
+            sm = m/3
+            cm = m%3
+            str.append("#undef "+c_re(0,sm,cm,sn,cn)+"\n")
+            str.append("#undef "+c_im(0,sm,cm,sn,cn)+"\n")

    return ''.join(str)
 # end def epilog

--- a/dslash_dagger_core.h
+++ b/dslash_dagger_core.h
@@ -1085,10 +1085,10 @@ o32_re = o32_im = 0;
    spinorFloat a30_re =  o00_re - o20_re;
    spinorFloat a30_im =  o00_im - o20_im;
    
-    o00_re = a00_re;
-    o10_re = a10_re;
-    o20_re = a20_re;
-    o30_re = a30_re;
+    o00_re = a00_re;  o00_im = a00_im;
+    o10_re = a10_re;  o10_im = a10_im;
+    o20_re = a20_re;  o20_im = a20_im;
+    o30_re = a30_re;  o30_im = a30_im;
 }
 {
    spinorFloat a01_re = -o11_re - o31_re;
@@ -1100,10 +1100,10 @@ o32_re = o32_im = 0;
    spinorFloat a31_re =  o01_re - o21_re;
    spinorFloat a31_im =  o01_im - o21_im;
    
-    o01_re = a01_re;
-    o11_re = a11_re;
-    o21_re = a21_re;
-    o31_re = a31_re;
+    o01_re = a01_re;  o01_im = a01_im;
+    o11_re = a11_re;  o11_im = a11_im;
+    o21_re = a21_re;  o21_im = a21_im;
+    o31_re = a31_re;  o31_im = a31_im;
 }
 {
    spinorFloat a02_re = -o12_re - o32_re;
@@ -1115,10 +1115,10 @@ o32_re = o32_im = 0;
    spinorFloat a32_re =  o02_re - o22_re;
    spinorFloat a32_im =  o02_im - o22_im;
    
-    o02_re = a02_re;
-    o12_re = a12_re;
-    o22_re = a22_re;
-    o32_re = a32_re;
+    o02_re = a02_re;  o02_im = a02_im;
+    o12_re = a12_re;  o12_im = a12_im;
+    o22_re = a22_re;  o22_im = a22_im;
+    o32_re = a32_re;  o32_im = a32_im;
 }

 // apply first chiral block
@@ -1329,10 +1329,10 @@ o32_re = o32_im = 0;
    spinorFloat a30_re = -o00_re + o20_re;
    spinorFloat a30_im = -o00_im + o20_im;
    
-    o00_re = a00_re;
-    o10_re = a10_re;
-    o20_re = a20_re;
-    o30_re = a30_re;
+    o00_re = a00_re;  o00_im = a00_im;
+    o10_re = a10_re;  o10_im = a10_im;
+    o20_re = a20_re;  o20_im = a20_im;
+    o30_re = a30_re;  o30_im = a30_im;
 }
 {
    spinorFloat a01_re =  o11_re + o31_re;
@@ -1344,10 +1344,10 @@ o32_re = o32_im = 0;
    spinorFloat a31_re = -o01_re + o21_re;
    spinorFloat a31_im = -o01_im + o21_im;
    
-    o01_re = a01_re;
-    o11_re = a11_re;
-    o21_re = a21_re;
-    o31_re = a31_re;
+    o01_re = a01_re;  o01_im = a01_im;
+    o11_re = a11_re;  o11_im = a11_im;
+    o21_re = a21_re;  o21_im = a21_im;
+    o31_re = a31_re;  o31_im = a31_im;
 }
 {
    spinorFloat a02_re =  o12_re + o32_re;
@@ -1359,10 +1359,10 @@ o32_re = o32_im = 0;
    spinorFloat a32_re = -o02_re + o22_re;
    spinorFloat a32_im = -o02_im + o22_im;
    
-    o02_re = a02_re;
-    o12_re = a12_re;
-    o22_re = a22_re;
-    o32_re = a32_re;
+    o02_re = a02_re;  o02_im = a02_im;
+    o12_re = a12_re;  o12_im = a12_im;
+    o22_re = a22_re;  o22_im = a22_im;
+    o32_re = a32_re;  o32_im = a32_im;
 }
 #endif // DSLASH_CLOVER

@@ -1474,3 +1474,40 @@ o32_re = o32_im = 0;
 #undef i31_im
 #undef i32_re
 #undef i32_im
+
+#undef c00_00_re
+#undef c01_01_re
+#undef c02_02_re
+#undef c10_10_re
+#undef c11_11_re
+#undef c12_12_re
+#undef c01_00_re
+#undef c01_00_im
+#undef c02_00_re
+#undef c02_00_im
+#undef c10_00_re
+#undef c10_00_im
+#undef c11_00_re
+#undef c11_00_im
+#undef c12_00_re
+#undef c12_00_im
+#undef c02_01_re
+#undef c02_01_im
+#undef c10_01_re
+#undef c10_01_im
+#undef c11_01_re
+#undef c11_01_im
+#undef c12_01_re
+#undef c12_01_im
+#undef c10_02_re
+#undef c10_02_im
+#undef c11_02_re
+#undef c11_02_im
+#undef c12_02_re
+#undef c12_02_im
+#undef c11_10_re
+#undef c11_10_im
+#undef c12_10_re
+#undef c12_10_im
+#undef c12_11_re
+#undef c12_11_im
--- a/dslash_def.h
+++ b/dslash_def.h
@@ -30,12 +30,12 @@

 #ifndef DD_LOOP
 #define DD_LOOP
-#define DD_DAG 0
+//#define DD_DAG 0
 #define DD_XPAY 0
 #define DD_RECON 0
 #define DD_GPREC 0
 #define DD_SPREC 0
-#define DD_CPREC 1
+#define DD_CPREC 0
 #endif

 // set options for current iteration
@@ -211,12 +211,12 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_

 // prepare next set of options, or clean up after final iteration

-#if (DD_DAG==0)
-#undef DD_DAG
-#define DD_DAG 1
-#else
-#undef DD_DAG
-#define DD_DAG 0
+//#if (DD_DAG==0)
+//#undef DD_DAG
+//#define DD_DAG 1
+//#else
+//#undef DD_DAG
+//#define DD_DAG 0

 #if (DD_XPAY==0)
 #undef DD_XPAY
@@ -252,14 +252,14 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
 #undef DD_SPREC
 #define DD_SPREC 0

-//#if (DD_CPREC==0)
-//#undef DD_CPREC
-//#define DD_CPREC 1
-#if (DD_CPREC==1)
+#if (DD_CPREC==0)
+#undef DD_CPREC
+#define DD_CPREC 1
+#elif (DD_CPREC==1)
+#undef DD_CPREC
+#define DD_CPREC 2
+#elif (DD_CPREC==2)
 #undef DD_CPREC
-//#define DD_CPREC 2
-//#elif (DD_CPREC==2)
-//#undef DD_CPREC
 #define DD_CPREC 3
 #else

@@ -276,7 +276,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
 #endif // DD_GPREC
 #endif // DD_RECON
 #endif // DD_XPAY
-#endif // DD_DAG
+//#endif // DD_DAG

 #ifdef DD_LOOP
 #include "dslash_def.h"

--- a/dslash_quda.cu
+++ b/dslash_quda.cu
--- a/dslash_quda.h
+++ b/dslash_quda.h
@@ -16,16 +16,15 @@ extern "C" {
  extern FullGauge cudaGaugePrecise;
  extern FullGauge cudaGaugeSloppy;

+  extern FullClover cudaClover;
+  extern FullClover cudaCloverSloppy;
+
  extern QudaGaugeParam *gauge_param;
  extern QudaInvertParam *invert_param;

-  extern FullClover cudaClover;
-
 // ---------- dslash_quda.cu ----------

  int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
-  void initDslashCuda();
-  void bindGaugeTex(FullGauge gauge, int oddBit);

  // Double precision routines
  void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
@@ -46,22 +45,76 @@ extern "C" {
 		       int oddBit, int daggerBit, ParitySpinor x, double a);

  // wrapper to above
-  void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
-  void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
-		      ParitySpinor x, double a);
+  void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
+		  int parity, int dagger);
+  void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
+		      int parity, int dagger, ParitySpinor x, double a);

  // Full Wilson matrix
-  void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int daggerBit);
-
+  void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa,
+	       int daggerBit);
  void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
-		 double kappa, ParitySpinor tmp, MatPCType matpc_type, int daggerBit);
+		 double kappa, ParitySpinor tmp, MatPCType matpc_type,
+		 int daggerBit);
+  void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge,
+			 ParitySpinor inEven, double kappa, ParitySpinor tmp,
+			 MatPCType matpc_type);
+
+  // clover Dslash routines
+  void cloverDslashCuda(ParitySpinor out, FullGauge gauge,
+			FullClover cloverInv, ParitySpinor in, int parity,
+			int dagger);
+  void cloverDslashDCuda(ParitySpinor res, FullGauge gauge,
+			 FullClover cloverInv, ParitySpinor spinor,
+			 int oddBit, int daggerBit);
+  void cloverDslashSCuda(ParitySpinor res, FullGauge gauge,
+			 FullClover cloverInv, ParitySpinor spinor,
+			 int oddBit, int daggerBit);
+  void cloverDslashHCuda(ParitySpinor res, FullGauge gauge,
+			 FullClover cloverInv, ParitySpinor spinor,
+			 int oddBit, int daggerBit);

-  void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
-			 double kappa, ParitySpinor tmp, MatPCType matpc_type);
+  void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge,
+			    FullClover cloverInv, ParitySpinor in, int parity,
+			    int dagger, ParitySpinor x, double a);
+  void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge,
+			     FullClover cloverInv, ParitySpinor spinor,
+			     int oddBit, int daggerBit, ParitySpinor x,
+			     double a);
+  void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge,
+			     FullClover cloverInv, ParitySpinor spinor,
+			     int oddBit, int daggerBit, ParitySpinor x,
+			     double a);
+  void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge,
+			     FullClover cloverInv, ParitySpinor spinor,
+			     int oddBit, int daggerBit, ParitySpinor x,
+			     double a);
+
+  void cloverMatPCCuda(ParitySpinor out, FullGauge gauge,
+		       FullClover cloverInv, ParitySpinor in, double kappa,
+		       ParitySpinor tmp, MatPCType matpc_type, int dagger);
+  void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge,
+			       FullClover cloverInv, ParitySpinor in,
+			       double kappa, ParitySpinor tmp,
+			       MatPCType matpc_type);
+  void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
+		     FullSpinor in, double kappa, ParitySpinor tmp,
+		     int dagger);
+
+  // routines for applying the clover term alone
+  void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
+		  ParitySpinor in, int parity);
+  void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
+		   ParitySpinor spinor, int oddBit);
+  void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
+		   ParitySpinor spinor, int oddBit);
+  void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
+		   ParitySpinor spinor, int oddBit);
  
  // -- inv_cg_cuda.cpp
  void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 
-		    FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *param);
+		    FullGauge gaugeSloppy, ParitySpinor tmp,
+		    QudaInvertParam *param);
  
  // -- inv_bicgstab_cuda.cpp
  void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 

--- a/dslash_test.c
+++ b/dslash_test.c
@@ -9,6 +9,8 @@

 // What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
 int test_type = 1;
+// clover-improved? (0 = plain Wilson, 1 = clover)
+int dslash_type = 0;

 QudaGaugeParam gaugeParam;
 QudaInvertParam inv_param;
@@ -49,13 +51,22 @@ void init() {
  gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
  gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
-  gauge_param = &gaugeParam;

  inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
  inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
  if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
  else inv_param.dirac_order = QUDA_DIRAC_ORDER;
  inv_param.kappa = kappa;
+
+  if (dslash_type) {
+    inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+    inv_param.clover_cpu_prec = QUDA_SINGLE_PRECISION;
+    inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
+  } else {
+    inv_param.dslash_type = QUDA_WILSON_DSLASH;
+  }
+
+  gauge_param = &gaugeParam;
  invert_param = &inv_param;

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
@@ -219,6 +230,10 @@ void dslashTest() {
    
    int flops = test_type ? 1320*2 + 48 : 1320;
    int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
+    if (dslash_type) {
+      flops += test_type ? 504*2 : 504;
+      floats += test_type ? 72*2 : 72;
+    }
    printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
    printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
    
@@ -230,11 +245,8 @@ void dslashTest() {
      
      if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
      else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);    
-    
  }    
-  
  end();
-  
 }

 int main(int argc, char **argv) {

--- a/enum_quda.h
+++ b/enum_quda.h
@@ -17,6 +17,11 @@ extern "C" {
    QUDA_LEX_DIRAC_ORDER // lexicographical order, colour inside spin
  } QudaDiracFieldOrder;  

+  typedef enum QudaCloverFieldOrder_s { // clover field ordering; type of QudaInvertParam.clover_order
+    QUDA_PACKED_CLOVER_ORDER, // even-odd order, packed
+    QUDA_LEX_PACKED_CLOVER_ORDER // lexicographical order, packed
+  } QudaCloverFieldOrder;
+
  typedef enum QudaDslashType_s {
    QUDA_WILSON_DSLASH,
    QUDA_CLOVER_WILSON_DSLASH
@@ -78,6 +83,12 @@ extern "C" {
    QUDA_PERIODIC_T = 1
  } QudaTboundary;

+  typedef enum QudaVerbosity_s { // solver output level; the CG/BiCGstab solvers test it with >=, so levels are cumulative
+    QUDA_SILENT = 0, // none of the verbosity-gated solver printouts are emitted
+    QUDA_SUMMARIZE = 1, // enables the "Residual updates = ..., Solution updates = ..." summary line
+    QUDA_VERBOSE = 2 // additionally prints the iteration count and r2 before the loop and after every iteration
+  } QudaVerbosity;
+
  typedef struct double3_s {
    double x;
    double y;

--- a/inv_bicgstab_quda.cpp
+++ b/inv_bicgstab_quda.cpp
@@ -72,6 +72,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
  int k=0;
  int xUpdate = 0, rUpdate = 0;

+  if (invert_param->verbosity >= QUDA_VERBOSE)
    printf("%d iterations, r2 = %e\n", k, r2);
  stopwatchStart();
  while (r2 > stop && k<invert_param->maxiter) {
@@ -143,6 +144,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
    }

    k++;
+
+    if (invert_param->verbosity >= QUDA_VERBOSE)
      printf("%d iterations, r2 = %e\n", k, r2);
  }

@@ -154,6 +157,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
  if (k==invert_param->maxiter) 
    printf("Exceeded maximum iterations %d\n", invert_param->maxiter);

+  if (invert_param->verbosity >= QUDA_SUMMARIZE)
    printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);

  float gflops = (1.0e-9*x.volume)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);

--- a/inv_cg_quda.cpp
+++ b/inv_cg_quda.cpp
@@ -59,6 +59,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
  int k=0;
  int xUpdate = 0, rUpdate = 0;

+  if (invert_param->verbosity >= QUDA_VERBOSE)
    printf("%d iterations, r2 = %e\n", k, r2);
  stopwatchStart();
  while (r2 > stop && k<perf->maxiter) {
@@ -110,6 +111,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
    }

    k++;
+    if (invert_param->verbosity >= QUDA_VERBOSE)
      printf("%d iterations, r2 = %e\n", k, r2);
  }

@@ -121,6 +123,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
  if (k==invert_param->maxiter) 
    printf("Exceeded maximum iterations %d\n", invert_param->maxiter);

+  if (invert_param->verbosity >= QUDA_SUMMARIZE)
    printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);

  float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize);

--- a/invert_quda.cpp
+++ b/invert_quda.cpp
@@ -14,6 +14,9 @@
 FullGauge cudaGaugePrecise; // precise gauge field
 FullGauge cudaGaugeSloppy; // sloppy gauge field

+FullClover cudaCloverPrecise;
+FullClover cudaCloverSloppy;
+
 void printGaugeParam(QudaGaugeParam *param) {

  printf("Gauge Params:\n");
@@ -36,6 +39,7 @@ void printGaugeParam(QudaGaugeParam *param) {
 void printInvertParam(QudaInvertParam *param) {
  printf("kappa = %e\n", param->kappa);
  printf("mass_normalization = %d\n", param->mass_normalization);
+  printf("dslash_type = %d\n", param->dslash_type);
  printf("inv_type = %d\n", param->inv_type);
  printf("tol = %e\n", param->tol);
  printf("iter = %d\n", param->iter);
@@ -45,10 +49,19 @@ void printInvertParam(QudaInvertParam *param) {
  printf("preserve_source = %d\n", param->preserve_source);
  printf("cpu_prec = %d\n", param->cpu_prec);
  printf("cuda_prec = %d\n", param->cuda_prec);
+  printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy);
  printf("dirac_order = %d\n", param->dirac_order);
  printf("spinorGiB = %e\n", param->spinorGiB);
+  if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+    printf("clover_cpu_prec = %d\n", param->clover_cpu_prec);
+    printf("clover_cuda_prec = %d\n", param->clover_cuda_prec);
+    printf("clover_cuda_prec_sloppy = %d\n", param->clover_cuda_prec_sloppy);
+    printf("clover_order = %d\n", param->clover_order);
+    printf("cloverGiB = %e\n", param->cloverGiB);
+  }
  printf("gflops = %e\n", param->gflops);
  printf("secs = %f\n", param->secs);
+  printf("verbosity = %d\n", param->verbosity);
 }

 void initQuda(int dev)
@@ -110,6 +123,14 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)

 }

+// for now, only single-precision clover is supported
+void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *param)
+{
+  // TODO: stub -- h_clover, h_clovinv and param are not read yet, so nothing is uploaded here
+  // struct copy: the sloppy field ends up sharing the precise field's clover/cloverNorm pointers
+  cudaCloverSloppy = cudaCloverPrecise;
+}
+
 void endQuda()
 {
  freeSpinorBuffer();

--- a/invert_quda.h
+++ b/invert_quda.h
@@ -57,10 +57,19 @@ extern "C" {

    QudaDiracFieldOrder dirac_order;

+    QudaPrecision clover_cpu_prec;
+    QudaPrecision clover_cuda_prec;
+    QudaPrecision clover_cuda_prec_sloppy;
+
+    QudaCloverFieldOrder clover_order;
+
    double spinorGiB;
+    double cloverGiB;
    double gflops;
    double secs;

+    QudaVerbosity verbosity;
+
  } QudaInvertParam;

  // Interface functions

--- a/invert_test.c
+++ b/invert_test.c
@@ -53,8 +53,9 @@ int main(int argc, char **argv)
  inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
  inv_param.solution_type = QUDA_MAT_SOLUTION;
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
-  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;  // preservation doesn't work with reliable?
+  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;
+  inv_param.verbosity = QUDA_VERBOSE;

  size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

--- a/quda.h
+++ b/quda.h
@@ -17,6 +17,7 @@
 // The Quda is added to avoid collisions with other libs
 #define GaugeFieldOrder QudaGaugeFieldOrder
 #define DiracFieldOrder QudaDiracFieldOrder
+#define CloverFieldOrder QudaCloverFieldOrder
 #define InverterType QudaInverterType  
 #define Precision QudaPrecision
 #define MatPCType QudaMatPCType
@@ -55,14 +56,14 @@ extern "C" {
    Precision precision;
    int length;
    int volume;
+    int X[4];
    int Nc;
    int Ns;
-    void *clover; // pointer to clover matrix
-    void *cloverInverse; // pointer to inverse of clover matrix
+    void *clover;
+    float *cloverNorm;
  } ParityClover;

  typedef struct {
-    Precision precision;
    ParityClover odd;
    ParityClover even;
  } FullClover;

--- a/read_clover.h
+++ b/read_clover.h
@@ -8,7 +8,7 @@
  double2 C6 = fetch_double2((clover), sid + (18*chi+6)*Vh);	\
  double2 C7 = fetch_double2((clover), sid + (18*chi+7)*Vh);	\
  double2 C8 = fetch_double2((clover), sid + (18*chi+8)*Vh);	\
-  double2 c9 = fetch_double2((clover), sid + (18*chi+9)*Vh);	\
+  double2 C9 = fetch_double2((clover), sid + (18*chi+9)*Vh);	\
  double2 C10 = fetch_double2((clover), sid + (18*chi+10)*Vh);  \
  double2 C11 = fetch_double2((clover), sid + (18*chi+11)*Vh);  \
  double2 C12 = fetch_double2((clover), sid + (18*chi+12)*Vh);  \
@@ -39,7 +39,7 @@
  float4 C6 = tex1Dfetch((clover), sid + (9*chi+6)*Vh);  \
  float4 C7 = tex1Dfetch((clover), sid + (9*chi+7)*Vh);  \
  float4 C8 = tex1Dfetch((clover), sid + (9*chi+8)*Vh);	 \
-  float K = tex1Dfetch((cloverTexNorm), sid+chi*Vh);	 \
+  float K = tex1Dfetch((cloverTexNorm), sid + chi*Vh);	 \
  C0.x *= K; C0.y *= K;	C0.z *= K; C0.w *= K;		 \
  C1.x *= K; C1.y *= K;	C1.z *= K; C1.w *= K;		 \
  C2.x *= K; C2.y *= K;	C2.z *= K; C2.w *= K;		 \