Advanced Computing Platform for Theoretical Physics

Commit 12b32dea authored by rbabich's avatar rbabich
Browse files

quda: standardized error reporting


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@599 be54200a-260c-0410-bdd7-ce6af2a381ab
parent d8c36033
......@@ -4,8 +4,6 @@ Version 0.x
- Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use.
- Modified loadCloverQuda() to take gauge_param as an additional
parameter.
- Added auto-tuning blas to improve performance (see README for details).
- Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi).
......
......@@ -83,7 +83,7 @@ extern "C" {
void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param);
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
......
......@@ -3,6 +3,10 @@
#include <cuda_runtime.h>
#ifdef USE_QMP
#include <qmp.h>
#endif
//#define L1 4 // "x" dimension
//#define L2 4 // "y" dimension
//#define L3 4 // "z" dimension
......@@ -31,6 +35,7 @@
#define Tboundary QudaTboundary
#include <enum_quda.h>
#include <util_quda.h>
#ifdef __cplusplus
extern "C" {
......
#ifndef _UTIL_QUDA_H
#define _UTIL_QUDA_H
#ifdef USE_QMP
#include <qmp.h>
#define printfQuda(...) do { \
if (QMP_get_node_number() == 0) { \
printf(__VA_ARGS__); \
fflush(stdout); \
} \
} while (0)
#define errorQuda(...) do { \
printf("QUDA error: " __VA_ARGS__); \
printf(" (node %d, " __FILE__ ":%d)\n", \
QMP_get_node_number(), __LINE__); \
QMP_abort(1); \
} while (0)
#else
#define printfQuda(...) do { printf(__VA_ARGS__); fflush(stdout); } while (0)
#define errorQuda(...) do { \
printf("QUDA error: " __VA_ARGS__); \
printf(" (" __FILE__ ":%d)\n", __LINE__); \
exit(1); \
} while (0)
#endif // USE_QMP
#define warningQuda(...) do { \
printfQuda("QUDA warning: " __VA_ARGS__); \
printfQuda("\n"); \
} while (0)
#define checkCudaError() do { \
cudaError_t error = cudaGetLastError(); \
if (error != cudaSuccess) \
errorQuda("CUDA: %s", cudaGetErrorString(error)); \
} while (0)
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -45,43 +45,37 @@ void initBlas(void) {
if (!d_reduceFloat) {
if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!d_reduceComplex) {
if (cudaMalloc((void**) &d_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!d_reduceFloat3) {
if (cudaMalloc((void**) &d_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!h_reduceFloat) {
if (cudaMallocHost((void**) &h_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
if (!h_reduceComplex) {
if (cudaMallocHost((void**) &h_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
if (!h_reduceFloat3) {
if (cudaMallocHost((void**) &h_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
......@@ -309,21 +303,18 @@ texture<float, 1, cudaReadModeElementType> texNorm5;
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision != b.precision) {
printf("checkSpinor error, precisions do not match: %d %d\n", a.precision, b.precision);
exit(-1);
errorQuda("checkSpinor: precisions do not match: %d %d", a.precision, b.precision);
}
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
errorQuda("checkSpinor: lengths do not match: %d %d", a.length, b.length);
}
}
// For kernels with precision conversion built in
inline void checkSpinorLength(ParitySpinor &a, ParitySpinor &b) {
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
errorQuda("checkSpinor: lengths do not match: %d %d", a.length, b.length);
}
}
......
......@@ -2,8 +2,13 @@
// This file defines functions to either initialize, check, or print
// the QUDA gauge and inverter parameters. It gets included in
// invert_quda.cpp, after either INIT_PARAM, CHECK_PARAM, or
// interface_quda.cpp, after either INIT_PARAM, CHECK_PARAM, or
// PRINT_PARAM is defined.
//
// If you're reading this file because it was mentioned in a "QUDA
// error" message, it probably means that you forgot to set one of the
// gauge or inverter parameters in your application before calling
// loadGaugeQuda() or invertQuda().
#include <float.h>
#define INVALID_INT QUDA_INVALID_ENUM
......@@ -12,18 +17,12 @@
// define macro to carry out the appropriate action for a given parameter
#if defined INIT_PARAM
#define P(x, val) do { ret.x = val; } while (0)
#define P(x, val) ret.x = val
#elif defined CHECK_PARAM
#define P(x, val) do { \
if (param->x == val) { \
printf("QUDA error: " #x " undefined.\n"); \
exit(1); \
} \
} while (0)
#define P(x, val) if (param->x == val) errorQuda("Parameter " #x " undefined")
#elif defined PRINT_PARAM
#define P(x, val) do { \
printf((val == INVALID_DOUBLE) ? #x " = %g\n" : #x " = %d\n", param->x); \
} while (0)
#define P(x, val) \
printfQuda((val == INVALID_DOUBLE) ? #x " = %g\n" : #x " = %d\n", param->x)
#else
#error INIT_PARAM, CHECK_PARAM, and PRINT_PARAM all undefined in check_params.h
#endif
......@@ -38,7 +37,7 @@ QudaGaugeParam newQudaGaugeParam(void) {
static void checkGaugeParam(QudaGaugeParam *param) {
#else
void printQudaGaugeParam(QudaGaugeParam *param) {
printf("QUDA Gauge Parameters:\n");
printfQuda("QUDA Gauge Parameters:\n");
#endif
P(X[0], INVALID_INT);
......@@ -76,7 +75,7 @@ QudaInvertParam newQudaInvertParam(void) {
static void checkInvertParam(QudaInvertParam *param) {
#else
void printQudaInvertParam(QudaInvertParam *param) {
printf("QUDA Inverter Parameters:\n");
printfQuda("QUDA Inverter Parameters:\n");
#endif
P(dslash_type, QUDA_INVALID_DSLASH);
......
......@@ -27,16 +27,14 @@ void allocateParityClover(ParityClover *ret, int *X, int pad, Precision precisio
if (!ret->clover) {
if (cudaMalloc((void**)&(ret->clover), ret->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover term\n");
exit(0);
errorQuda("Error allocating clover term");
}
}
if (!ret->cloverNorm) {
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret->cloverNorm, ret->bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating cloverNorm\n");
exit(0);
errorQuda("Error allocating cloverNorm");
}
}
}
......@@ -178,23 +176,19 @@ void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
void *packedClover, *packedCloverNorm;
if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
printf("QUDA error: invalid clover order\n");
exit(-1);
errorQuda("Invalid clover_order");
}
#ifndef __DEVICE_EMULATION__
if (cudaMallocHost(&packedClover, ret.bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover pinned memory\n");
exit(0);
errorQuda("Error allocating clover pinned memory");
}
if (ret.precision == QUDA_HALF_PRECISION)
if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating clover pinned memory\n");
exit(0);
errorQuda("Error allocating clover pinned memory");
}
#else
packedClover = malloc(ret.bytes);
......@@ -241,12 +235,10 @@ void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;
if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
printf("QUDA error: invalid clover order\n");
exit(-1);
errorQuda("Invalid clover order");
}
#ifndef __DEVICE_EMULATION__
......@@ -321,8 +313,7 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
loadParityClover(ret.even, clover, cpu_prec, clover_order);
loadParityClover(ret.odd, clover_odd, cpu_prec, clover_order);
} else {
printf("QUDA error: CloverFieldOrder %d not supported\n", clover_order);
exit(-1);
errorQuda("Invalid clover_order");
}
}
......@@ -330,8 +321,7 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision, QudaInvertParam invert_param)
{
if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
// X should contain the dimensions of the even/odd sublattice
......
static void checkSpinor(ParitySpinor out, ParitySpinor in) {
if (in.precision != out.precision) {
printf("Error in dslash quda: input and out spinor precisions don't match\n");
exit(-1);
errorQuda("Input and output spinor precisions don't match in dslash_quda");
}
if (in.stride != out.stride) {
printf("Error in dslash quda: input and out spinor strides don't match\n");
exit(-1);
errorQuda("Input and output spinor strides don't match in dslash_quda");
}
#if (__CUDA_ARCH__ != 130)
if (in.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
static void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
if (spinor.volume != gauge.volume) {
printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match gauge volume %d", spinor.volume, gauge.volume);
}
#if (__CUDA_ARCH__ != 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
static void checkCloverSpinor(ParitySpinor spinor, FullClover clover) {
if (spinor.volume != clover.even.volume) {
printf("Error, spinor volume %d doesn't match even clover volume %d\n",
spinor.volume, clover.even.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match even clover volume %d",
spinor.volume, clover.even.volume);
}
if (spinor.volume != clover.odd.volume) {
printf("Error, spinor volume %d doesn't match odd clover volume %d\n",
spinor.volume, clover.odd.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match odd clover volume %d",
spinor.volume, clover.odd.volume);
}
#if (__CUDA_ARCH__ != 130)
if ((clover.even.precision == QUDA_DOUBLE_PRECISION) ||
(clover.odd.precision == QUDA_DOUBLE_PRECISION)) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
......@@ -47,8 +47,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
cudaMemcpyToSymbol("cl_stride", &cl_stride, sizeof(int));
if (Vh%BLOCK_DIM != 0) {
printf("Error, volume not a multiple of the thread block size\n");
exit(-1);
errorQuda("Volume not a multiple of the thread block size");
}
int X1 = 2*gauge.X[0];
......@@ -122,12 +121,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("initDslashConstants error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
initDslash = 1;
}
......@@ -175,13 +169,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += 1320*in.volume;
}
......@@ -386,13 +374,7 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+48)*in.volume;
}
......@@ -616,8 +598,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
errorQuda("matpc_type not valid for plain Wilson");
}
}
......@@ -666,13 +647,7 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+504)*in.volume;
}
......@@ -1179,13 +1154,7 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+504+48)*in.volume;
}
......@@ -1725,7 +1694,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC))
&& (clover.even.clover == NULL)) {
printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n");
errorQuda("For asymmetric matpc_type, the uninverted clover term must be loaded");
}
// FIXME: For asymmetric, a "dslashCxpay" kernel would improve performance.
......@@ -1746,8 +1715,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
} else { // symmetric preconditioning, dagger
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -1759,8 +1727,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
cloverDslashCuda(tmp, gauge, cloverInv, out, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
}
}
......@@ -1803,13 +1770,7 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverHCuda(out, gauge, clover, in, parity);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += 504*in.volume;
}
......
......@@ -488,15 +488,13 @@ static void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct
if (!cudaGauge->even) {
if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even gauge field\n");
exit(0);
errorQuda("Error allocating even gauge field");
}
}
if (!cudaGauge->odd) {
if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even odd gauge field\n");
exit(0);
errorQuda("Error allocating even odd gauge field");
}
}
......@@ -531,22 +529,14 @@ static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, GaugeFiel
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh, pad);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
}
cudaError_t error = cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
errorQuda("Invalid gauge_order");
}
error = cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
checkCudaError();
cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
checkCudaError();
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
......@@ -583,8 +573,7 @@ static void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, Gauge
unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
errorQuda("Invalid gauge_order");
}
#ifndef __DEVICE_EMULATION__
......@@ -602,8 +591,7 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec,
Tboundary t_boundary, int *XX, double anisotropy, int pad)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
Anisotropy = anisotropy;
......@@ -658,8 +646,7 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec,
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, Precision cpu_prec, GaugeFieldOrder gauge_order)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) {
......
......@@ -40,8 +40,7 @@ FullClover cudaCloverInvSloppy;
static void checkPrecision(QudaPrecision precision)
{
if (precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
}
......@@ -50,29 +49,26 @@ void initQuda(int dev)
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "No devices supporting CUDA.\n");
exit(EXIT_FAILURE);
errorQuda("No devices supporting CUDA");
}
for(int i=0; i<deviceCount; i++) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, i);
fprintf(stderr, "found device %d: %s\n", i, deviceProp.name);
fprintf(stderr, "QUDA: Found device %d: %s\n", i, deviceProp.name);
}
if(dev<0) {
if (dev < 0) {
dev = deviceCount - 1;
//dev = 0;
}
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
if (deviceProp.major < 1) {
fprintf(stderr, "Device %d does not support CUDA.\n", dev);
exit(EXIT_FAILURE);
errorQuda("Device %d does not support CUDA", dev);
}
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
fprintf(stderr, "QUDA: Using device %d: %s\n", dev, deviceProp.name);
cudaSetDevice(dev);
cudaGaugePrecise.even = NULL;
......@@ -126,30 +122,25 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
restoreGaugeField(h_gauge, &cudaGaugePrecise, param->cpu_prec, param->gauge_order);
}
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{
if (!h_clover && !h_clovinv) {
printf("QUDA error: loadCloverQuda() called with neither clover term nor inverse\n");
exit(-1);
errorQuda("loadCloverQuda() called with neither clover term nor inverse");
}
if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on CPU\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
if (cudaGaugePrecise.even == NULL) {
printf("QUDA error: gauge field must be loaded before clover\n");
exit(-1);
errorQuda("Gauge field must be loaded before clover");
}
if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) {
printf("QUDA error: wrong dslash_type in loadCloverQuda()\n");
exit(-1);
errorQuda("Wrong dslash_type in loadCloverQuda()");
}
int X[4];
for (int i=0; i<4; i++) {
X[i] = gauge_param->X[i];
X[i] = cudaGaugePrecise.X[i];
}
X[0] /= 2; // dimensions of the even-odd sublattice
inv_param->cloverGiB = 0;
......@@ -172,8 +163,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param
allocateCloverField(&cudaCloverInvPrecise, X, inv_param->cl_pad, inv_param->clover_cuda_prec);
if (!h_clovinv) {
printf("QUDA error: clover term inverse not implemented yet\n");
exit(-1);
errorQuda("Clover term inverse not implemented yet");
} else {
loadCloverField(cudaCloverInvPrecise, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order);
}
......@@ -232,8 +222,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverDslashCuda(out, cudaGaugePrecise, cudaCloverInvPrecise, in, parity, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -261,8 +250,7 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
cloverMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, kappa,
tmp, inv_param->matpc_type, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -291,8 +279,7 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
cloverMatPCDagMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, kappa,
tmp, inv_param->matpc_type);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -321,8 +308,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
cloverMatCuda(out, cudaGaugePrecise, cudaCloverPrecise, in, kappa, tmp, dagger);
freeParitySpinor(tmp);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -388,8 +374,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// in = b_o + k D_oe b_e
dslashXpayCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
errorQuda("matpc_type not valid for plain Wilson");
}
} else if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -415,12 +400,10 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
cloverCuda(aux, cudaGaugePrecise, cudaCloverInvPrecise, b.even, 0); // safe even when aux = b.even
dslashXpayCuda(in, cudaGaugePrecise, aux, 1, 0, b.odd, kappa);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
......@@ -465,8 +448,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
invertBiCGstabCuda(out, in, tmp, param, QUDA_DAG_NO);
break;
default:
printf("Inverter type %d not implemented\n", param->inv_type);
exit(-1);
errorQuda("Inverter type %d not implemented", param->inv_type);
}
if (param->solution_type == QUDA_MAT_SOLUTION) {
......
......@@ -13,12 +13,7 @@
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp, DagType dag_type) {
{cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("B4 MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}}
checkCudaError();
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
......@@ -30,14 +25,7 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp,
invert_param->matpc_type, dag_type);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
}
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
......@@ -95,7 +83,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
double maxrr = rNorm;
double maxrx = rNorm;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
dslash_quda_flops = 0;
......@@ -166,20 +154,20 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
}
k++;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x);
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (k==invert_param->maxiter) warningQuda("Exceeded maximum iterations %d", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_VERBOSE) printf("Reliable updates = %d\n", rUpdate);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("Reliable updates = %d\n", rUpdate);
invert_param->secs += stopwatchReadSeconds();
float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
// printf("%f gflops\n", gflops / stopwatchReadSeconds());
// printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
invert_param->gflops += gflops;
invert_param->iter += k;
......@@ -190,7 +178,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
copyCuda(b, src);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
printfQuda("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
......
......@@ -64,7 +64,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
int k=0;
int rUpdate = 0;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
......@@ -112,7 +112,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
k++;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
printfQuda("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
......@@ -122,13 +122,13 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
warningQuda("Exceeded maximum iterations %d", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Reliable updates = %d\n", rUpdate);
printfQuda("Reliable updates = %d\n", rUpdate);
float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
// printf("%f gflops\n", gflops / stopwatchReadSeconds());
// printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
invert_param->gflops = gflops;
invert_param->iter = k;
......@@ -140,7 +140,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, y);
double true_res = xmyNormCuda(b, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
printfQuda("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
......
......@@ -103,13 +103,11 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPre
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_complex(): length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_complex: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_complex: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_complex: block size greater than maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -129,16 +127,12 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPre
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
checkCudaError();
cuDoubleComplex gpu_result;
gpu_result.x = 0;
......
......@@ -85,13 +85,11 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_core: length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_core: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_core: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_core: block size greater than maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -111,16 +109,12 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
checkCudaError();
double cpu_sum = 0;
for (int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];
......
......@@ -117,13 +117,11 @@ double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision p
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_triple_core: length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_triple_core: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_triple_core: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_triple_core: block size greater then maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -143,16 +141,12 @@ double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision p
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
checkCudaError();
double3 gpu_result;
gpu_result.x = 0;
......
......@@ -35,14 +35,12 @@ ParitySpinor allocateParitySpinor(int *X, Precision precision, int pad) {
else ret.bytes = ret.length*sizeof(short);
if (cudaMalloc((void**)&ret.spinor, ret.bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating spinor\n");
exit(0);
errorQuda("Error allocating spinor");
}
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret.spinorNorm, ret.bytes/12) == cudaErrorMemoryAllocation) {
printf("Error allocating spinorNorm\n");
exit(0);
errorQuda("Error allocating spinorNorm");
}
}
......@@ -370,8 +368,7 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
DiracFieldOrder dirac_order) {
if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error, cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (ret.precision != QUDA_HALF_PRECISION) {
......@@ -462,8 +459,7 @@ void loadSpinorField(FullSpinor ret, void *spinor, Precision cpu_prec, DiracFiel
loadParitySpinor(ret.even, spinor_odd, cpu_prec, dirac_order);
loadParitySpinor(ret.odd, spinor, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
errorQuda("Invalid dirac_order");
}
}
......@@ -543,8 +539,7 @@ void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, Dirac
retrieveParitySpinor(res, spinor.odd, cpu_prec, dirac_order);
retrieveParitySpinor(res_odd, spinor.even, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
errorQuda("Invalid dirac_order");
}
}
......
......@@ -52,7 +52,7 @@ void init() {
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
......@@ -69,7 +69,7 @@ void init() {
inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.ga_pad = 0;
inv_param.sp_pad = 0;
......@@ -143,7 +143,7 @@ void init() {
gauge = cudaGaugePrecise;
if (clover_yes) {
loadCloverQuda(NULL, hostCloverInv, &gauge_param, &inv_param);
loadCloverQuda(NULL, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
......
......@@ -29,9 +29,9 @@ int main(int argc, char **argv)
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -55,8 +55,8 @@ int main(int argc, char **argv)
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
......@@ -66,8 +66,8 @@ int main(int argc, char **argv)
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
inv_param.verbosity = QUDA_VERBOSE;
......@@ -102,7 +102,7 @@ int main(int argc, char **argv)
initQuda(device);
loadGaugeQuda((void*)gauge, &gauge_param);
if (clover_yes) loadCloverQuda(NULL, clover_inv, &gauge_param, &inv_param);
if (clover_yes) loadCloverQuda(NULL, clover_inv, &inv_param);
invertQuda(spinorOut, spinorIn, &inv_param);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment