Initial pass at production code

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@284 be54200a-260c-0410-bdd7-ce6af2a381ab

Initial pass at production code
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@284 be54200a-260c-0410-bdd7-ce6af2a381ab
ca9d56f0 · mikeaclark · ca9d56f0 · ca9d56f0 · ca9d56f0 · ca9d56f0
Commit ca9d56f0 authored Jan 17, 2009 by mikeaclark
--- a/Makefile.tmpl
+++ b/Makefile.tmpl
+CUFILES		:= dslash_cuda.cu blas_cuda.cu
+CCFILES		:= inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp field_cuda.cpp
+
+CUDA_INSTALL_PATH = /usr/local/cuda
+INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include
+LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart -lcublas
+
+CC = gcc
+CFLAGS = -Wall -O3 -std=c99 $(INCLUDES)
+CXX = g++
+CXXFLAGS = -Wall $(INCLUDES) -DUNIX -O3 
+NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc 
+NVCCFLAGS = $(INCLUDES) -DUNIX -O3
+LDFLAGS = -fPIC $(LIB)
+CCOBJECTS = $(CCFILES:.cpp=.o)
+CUOBJECTS = $(CUFILES:.cu=.o)
+
+all: dslash_test invert_test su3_test
+
+ILIB = libquda.a
+ILIB_OBJS = inv_bicgstab_quda.o inv_cg_quda.o dslash_quda.o blas_quda.o util_quda.o \
+	dslash_reference.o blas_reference.o invert_quda.o field_quda.o
+ILIB_DEPS = $(ILIB_OBJS) blas_quda.h quda.h util_quda.h invert_quda.h field_quda.h enum_quda.h
+
+$(ILIB): $(ILIB_DEPS)
+	ar cru $@ $(ILIB_OBJS)
+
+invert_test: invert_test.o $(ILIB)
+	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+
+dslash_test: dslash_test.o $(ILIB)
+	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+
+su3_test: su3_test.o $(ILIB)
+	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+
+clean:
+	-rm -f *.o dslash_test invert_test su3_test $(ILIB)
+
+%.o: %.c
+	$(CC) $(CFLAGS) $< -c -o $@
+
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) $< -c -o $@
+
+%.o: %.cu
+	$(NVCC) $(NVCCFLAGS) $< -c -o $@
--- a/blas_quda.cu
+++ b/blas_quda.cu
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <quda.h>
+#include <util_quda.h>
+
+#define REDUCE_DOUBLE_PRECISION
+
+#define REDUCE_THREADS 128
+#define REDUCE_MAX_BLOCKS 64
+
+void zeroCuda(float* dst, int len) {
+    // cuda's floating point format, IEEE-754, represents the floating point
+    // zero as 4 zero bytes
+    cudaMemset(dst, 0, len*sizeof(float));
+}
+
+void copyCuda(float* dst, float *src, int len) {
+    cudaMemcpy(dst, src, len*sizeof(float), cudaMemcpyDeviceToDevice);
+}
+
+__global__ void axpbyKernel(float a, float *x, float b, float *y, int len) {
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+        y[i] = a*x[i] + b*y[i];
+        i += gridSize;
+    } 
+}
+
+// performs the operation y[i] = a*x[i] + b*y[i]
+void axpbyCuda(float a, float *x, float b, float *y, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    axpbyKernel<<<dimGrid, dimBlock>>>(a, x, b, y, len);
+}
+
+// performs the operation y[i] = a*x[i] + y[i]
+void axpyCuda(float a, float *x, float *y, int len) {
+    axpbyCuda(a, x, 1.0, y, len);
+}
+
+// performs the operation y[i] = x[i] + a*y[i]
+void xpayCuda(float *x, float a, float *y, int len) {
+    axpbyCuda(1.0, x, a, y, len);
+}
+
+// performs the operation y[i] -= x[i] (minus x plus y)
+void mxpyCuda(float *x, float *y, int len) {
+    axpbyCuda(-1.0, x, 1.0, y, len);
+}
+
+// performs the operation x[i] = a*x[i]
+void axCuda(float a, float *x, int len) {
+    axpbyCuda(0.0, x, a, x, len);
+}
+
+__global__ void caxpyKernel(float2 a, float2 *x, float2 *y, int len) {
+
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+	float Xx = x[i].x;
+	float Xy = x[i].y;
+        y[i].x += a.x*Xx - a.y*Xy;
+	y[i].y += a.y*Xx + a.x*Xy;
+        i += gridSize;
+    } 
+
+}
+
+// performs the operation y[i] += a*x[i]
+void caxpyCuda(float2 a, float2 *x, float2 *y, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    caxpyKernel<<<dimGrid, dimBlock>>>(a, x, y, len);
+}
+
+__global__ void caxpbyKernel(float2 a, float2 *x, float2 b, float2 *y, int len) {
+
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+	float Xx = x[i].x;
+	float Xy = x[i].y;
+	float Yx = y[i].x;
+	float Yy = y[i].y;
+        y[i].x = a.x*Xx + b.x*Yx - a.y*Xy - b.y*Yy;
+	y[i].y = a.y*Xx + b.y*Yx + a.x*Xy + b.x*Yy;
+        i += gridSize;
+    } 
+
+}
+
+// performs the operation y[i] = c*x[i] + b*y[i]
+void caxbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    caxpbyKernel<<<dimGrid, dimBlock>>>(a, x, b, y, len);   
+}
+
+__global__ void cxpaypbzKernel(float2 *x, float2 a, float2 *y, 
+                               float2 b, float2 *z, int len) {
+
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+	float Xx = x[i].x;
+	float Xy = x[i].y;
+	float Yx = y[i].x;
+	float Yy = y[i].y;
+	float Zx = z[i].x;
+	float Zy = z[i].y;
+	
+	Xx += a.x*Yx - a.y*Yy;
+ 	Xy += a.y*Yx + a.x*Yy;
+	Xx += b.x*Zx - b.y*Zy;
+	Xy += b.y*Zx + b.x*Zy;
+
+	z[i].x = Xx;
+	z[i].y = Xy;
+        i += gridSize;
+    } 
+
+}
+
+// performs the operation z[i] = x[i] + a*y[i] + b*z[i]
+void cxpaypbzCuda(float2 *x, float2 a, float2 *y, float2 b, float2 *z, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    cxpaypbzKernel<<<dimGrid, dimBlock>>>(x, a, y, b, z, len);   
+}
+
+__global__ void axpyZpbxKernel(float a, float *x, float *y, float *z, float b, int len) {
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+        float x_i = x[i];
+        y[i] = a*x_i + y[i];
+        x[i] = z[i] + b*x_i;
+        i += gridSize;
+    }
+}
+
+// performs the operations: {y[i] = a x[i] + y[i]; x[i] = z[i] + b x[i]}
+void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, x, y, z, b, len);
+}
+
+__global__ void caxpbypzYmbwKernel(float2 a, float2 *x, float2 b, float2 *y, 
+	                           float2 *z, float2 *w, int len) {
+
+    unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
+    unsigned int gridSize = gridDim.x*blockDim.x;
+    while (i < len) {
+	float Xx = x[i].x;
+	float Xy = x[i].y;
+	float Yx = y[i].x;
+	float Yy = y[i].y;
+	float Wx = w[i].x;
+	float Wy = w[i].y;
+
+        float Zx = a.x*Xx - a.y*Xy;
+	float Zy = a.y*Xx + a.x*Xy;
+	Zx += b.x*Yx - b.y*Yy;
+	Zy += b.y*Yx + b.x*Yy;
+	Yx -= b.x*Wx - b.y*Wy;
+	Yy -= b.y*Wx + b.x*Wy;	
+
+	z[i].x += Zx;
+	z[i].y += Zy;
+	y[i].x = Yx;
+	y[i].y = Yy;
+        i += gridSize;
+    } 
+
+}
+
+// performs the operation z[i] = a*x[i] + b*y[i] + z[i] and y[i] -= b*w[i]
+void caxpbypzYmbwCuda(float2 a, float2 *x, float2 b, float2 *y,
+                      float2 *z, float2 *w, int len) {
+    int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
+    dim3 dimBlock(REDUCE_THREADS, 1, 1);
+    dim3 dimGrid(blocks, 1, 1);
+    caxpbypzYmbwKernel<<<dimGrid, dimBlock>>>(a, x, b, y, z, w, len);   
+}
+
+
+// performs the operation y[i] = a*x[i] + y[i], and returns norm(y)
+// float axpyNormCuda(float a, float *x, float *y, int len);
+
+
+// Computes c = a + b in "double single" precision.
+__device__ void dsadd(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1) {
+    // Compute dsa + dsb using Knuth's trick.
+    float t1 = a0 + b0;
+    float e = t1 - a0;
+    float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;
+    // The result is t1 + t2, after normalization.
+    c0 = e = t1 + t2;
+    c1 = t2 - (e - t1);
+}
+
+// Computes c = a + b in "double single" precision (complex version)
+__device__ void zcadd(float2 &c0, float2 &c1, const float2 a0, const float2 a1, const float2 b0, const float2 b1) {
+    // Compute dsa + dsb using Knuth's trick.
+    float t1 = a0.x + b0.x;
+    float e = t1 - a0.x;
+    float t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
+    // The result is t1 + t2, after normalization.
+    c0.x = e = t1 + t2;
+    c1.x = t2 - (e - t1);
+
+    // Compute dsa + dsb using Knuth's trick.
+    t1 = a0.y + b0.y;
+    e = t1 - a0.y;
+    t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
+    // The result is t1 + t2, after normalization.
+    c0.y = e = t1 + t2;
+    c1.y = t2 - (e - t1);
+}
+
+// Computes c = a + b in "double single" precision (float3 version)
+__device__ void dsadd3(float3 &c0, float3 &c1, const float3 a0, const float3 a1, const float3 b0, const float3 b1) {
+    // Compute dsa + dsb using Knuth's trick.
+    float t1 = a0.x + b0.x;
+    float e = t1 - a0.x;
+    float t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
+    // The result is t1 + t2, after normalization.
+    c0.x = e = t1 + t2;
+    c1.x = t2 - (e - t1);
+
+    // Compute dsa + dsb using Knuth's trick.
+    t1 = a0.y + b0.y;
+    e = t1 - a0.y;
+    t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
+    // The result is t1 + t2, after normalization.
+    c0.y = e = t1 + t2;
+    c1.y = t2 - (e - t1);
+
+    // Compute dsa + dsb using Knuth's trick.
+    t1 = a0.z + b0.z;
+    e = t1 - a0.z;
+    t2 = ((b0.z - e) + (a0.z - (t1 - e))) + a1.z + b1.z;
+    // The result is t1 + t2, after normalization.
+    c0.z = e = t1 + t2;
+    c1.z = t2 - (e - t1);
+}
+
+//
+// float sumCuda(float *a, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) sum##suffix
+#define REDUCE_TYPES float *a
+#define REDUCE_PARAMS a
+#define REDUCE_AUXILIARY(i)
+#define REDUCE_OPERATION(i) a[i]
+#include "reduce_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_AUXILIARY
+#undef REDUCE_OPERATION
+
+//
+// float normCuda(float *a, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) norm##suffix
+#define REDUCE_TYPES float *a
+#define REDUCE_PARAMS a
+#define REDUCE_AUXILIARY(i)
+#define REDUCE_OPERATION(i) (a[i]*a[i])
+#include "reduce_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_AUXILIARY
+#undef REDUCE_OPERATION
+
+//
+// float reDotProductCuda(float *a, float *b, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) reDotProduct##suffix
+#define REDUCE_TYPES float *a, float *b
+#define REDUCE_PARAMS a, b
+#define REDUCE_AUXILIARY(i)
+#define REDUCE_OPERATION(i) (a[i]*b[i])
+#include "reduce_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_AUXILIARY
+#undef REDUCE_OPERATION
+
+
+//
+// float2 cDotProductCuda(float2 *a, float2 *b, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) cDotProduct##suffix
+#define REDUCE_TYPES float2 *a, float2 *b
+#define REDUCE_PARAMS a, b
+#define REDUCE_REAL_AUXILIARY(i)
+#define REDUCE_IMAG_AUXILIARY(i)
+#define REDUCE_REAL_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
+#define REDUCE_IMAG_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
+#include "reduce_complex_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_REAL_AUXILIARY
+#undef REDUCE_IMAG_AUXILIARY
+#undef REDUCE_REAL_OPERATION
+#undef REDUCE_IMAG_OPERATION
+
+
+
+//
+// float3 cDotProductNormACuda(float2 *a, float2 *b, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) cDotProductNormA##suffix
+#define REDUCE_TYPES float2 *a, float2 *b
+#define REDUCE_PARAMS a, b
+#define REDUCE_X_AUXILIARY(i)
+#define REDUCE_Y_AUXILIARY(i)
+#define REDUCE_Z_AUXILIARY(i)
+#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
+#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
+#define REDUCE_Z_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
+#include "reduce_triple_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_X_AUXILIARY
+#undef REDUCE_Y_AUXILIARY
+#undef REDUCE_Z_AUXILIARY
+#undef REDUCE_X_OPERATION
+#undef REDUCE_Y_OPERATION
+#undef REDUCE_Z_OPERATION
+
+
+//
+// float3 cDotProductNormBCuda(float2 *a, float2 *b, int n) {}
+//
+#define REDUCE_FUNC_NAME(suffix) cDotProductNormB##suffix
+#define REDUCE_TYPES float2 *a, float2 *b
+#define REDUCE_PARAMS a, b
+#define REDUCE_X_AUXILIARY(i)
+#define REDUCE_Y_AUXILIARY(i)
+#define REDUCE_Z_AUXILIARY(i)
+#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
+#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
+#define REDUCE_Z_OPERATION(i) (b[i].x*b[i].x + b[i].y*b[i].y)
+#include "reduce_triple_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_X_AUXILIARY
+#undef REDUCE_Y_AUXILIARY
+#undef REDUCE_Z_AUXILIARY
+#undef REDUCE_X_OPERATION
+#undef REDUCE_Y_OPERATION
+#undef REDUCE_Z_OPERATION
+
+
+//
+// float3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y, 
+// float2 *z, float2 *w, float2 *u, int len)
+//
+#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormY##suffix
+#define REDUCE_TYPES float2 a, float2 *x, float2 b, float2 *y, float2 *z, float2 *w, float2 *u
+#define REDUCE_PARAMS a, x, b, y, z, w, u
+#define REDUCE_X_AUXILIARY(i) \
+	float Xx = x[i].x; \
+	float Xy = x[i].y; \
+	float Yx = y[i].x; \
+	float Yy = y[i].y; \
+	float Wx = w[i].x; \
+	float Wy = w[i].y;
+#define REDUCE_Y_AUXILIARY(i) \
+        float Zx = a.x*Xx - a.y*Xy; \
+	float Zy = a.y*Xx + a.x*Xy; \
+	Zx += b.x*Yx - b.y*Yy; \
+	Zy += b.y*Yx + b.x*Yy; \
+	Yx -= b.x*Wx - b.y*Wy; \
+	Yy -= b.y*Wx + b.x*Wy;	
+#define REDUCE_Z_AUXILIARY(i) \
+	z[i].x += Zx; \
+	z[i].y += Zy; \
+	y[i].x = Yx; \
+	y[i].y = Yy;
+#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
+#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
+#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
+#include "reduce_triple_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_X_AUXILIARY
+#undef REDUCE_Y_AUXILIARY
+#undef REDUCE_Z_AUXILIARY
+#undef REDUCE_X_OPERATION
+#undef REDUCE_Y_OPERATION
+#undef REDUCE_Z_OPERATION
+
+//
+// float axpyNormCuda(float a, float *x, float *y, n){}
+//
+// First performs the operation y[i] = a*x[i] + y[i]
+// Second returns the norm of y
+//
+
+#define REDUCE_FUNC_NAME(suffix) axpyNorm##suffix
+#define REDUCE_TYPES float a, float *x, float *y
+#define REDUCE_PARAMS a, x, y
+#define REDUCE_AUXILIARY(i) y[i] = a*x[i] + y[i]
+#define REDUCE_OPERATION(i) (y[i]*y[i])
+#include "reduce_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_AUXILIARY
+#undef REDUCE_OPERATION
+
+
+//
+// float2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int n) {}
+//
+// First performs the operation y = x - a*y
+// Second returns complex dot product (z,y)
+//
+
+#define REDUCE_FUNC_NAME(suffix) xpaycDotzy##suffix
+#define REDUCE_TYPES float2 *x, float a, float2 *y, float2 *z
+#define REDUCE_PARAMS x, a, y, z
+#define REDUCE_REAL_AUXILIARY(i) y[i].x = x[i].x + a*y[i].x
+#define REDUCE_IMAG_AUXILIARY(i) y[i].y = x[i].y + a*y[i].y
+#define REDUCE_REAL_OPERATION(i) (z[i].x*y[i].x + z[i].y*y[i].y)
+#define REDUCE_IMAG_OPERATION(i) (z[i].x*y[i].y - z[i].y*y[i].x)
+#include "reduce_complex_core.h"
+#undef REDUCE_FUNC_NAME
+#undef REDUCE_TYPES
+#undef REDUCE_PARAMS
+#undef REDUCE_REAL_AUXILIARY
+#undef REDUCE_IMAG_AUXILIARY
+#undef REDUCE_REAL_OPERATION
+#undef REDUCE_IMAG_OPERATION
+
+
+
+double cpuDouble(float *data, int size) {
+    double sum = 0;
+    for (int i = 0; i < size; i++)
+        sum += data[i];
+    return sum;
+}
+
+void blasTest() {
+    int n = 3*1<<24;
+    float *h_data = (float *)malloc(n*sizeof(float));
+    float *d_data;
+    if (cudaMalloc((void **)&d_data,  n*sizeof(float))) {
+      printf("Error allocating d_data\n");
+      exit(0);
+    }
+    
+    for (int i = 0; i < n; i++) {
+        h_data[i] = rand()/(float)RAND_MAX - 0.5; // n-1.0-i;
+    }
+    
+    cudaMemcpy(d_data, h_data, n*sizeof(float), cudaMemcpyHostToDevice);
+    
+    cudaThreadSynchronize();
+    stopwatchStart();
+    int LOOPS = 20;
+    for (int i = 0; i < LOOPS; i++) {
+        sumCuda(d_data, n);
+    }
+    cudaThreadSynchronize();
+    float secs = stopwatchReadSeconds();
+    
+    printf("%f GiB/s\n", 1.e-9*n*sizeof(float)*LOOPS / secs);
+    printf("Device: %f MiB\n", (float)n*sizeof(float) / (1 << 20));
+    printf("Shared: %f KiB\n", (float)REDUCE_THREADS*sizeof(float) / (1 << 10));
+
+    float correctDouble = cpuDouble(h_data, n);
+    printf("Total: %f\n", correctDouble);
+    printf("CUDA db: %f\n", fabs(correctDouble-sumCuda(d_data, n)));
+    
+    cudaFree(d_data) ;
+    free(h_data);
+}
+
+void axpbyTest() {
+    int n = 3 * 1 << 20;
+    float *h_x = (float *)malloc(n*sizeof(float));
+    float *h_y = (float *)malloc(n*sizeof(float));
+    float *h_res = (float *)malloc(n*sizeof(float));
+    
+    float *d_x, *d_y;
+    if (cudaMalloc((void **)&d_x,  n*sizeof(float))) {
+      printf("Error allocating d_x\n");
+      exit(0);
+    }
+    if (cudaMalloc((void **)&d_y,  n*sizeof(float))) {
+      printf("Error allocating d_y\n");
+      exit(0);
+    }
+
+
+    
+    for (int i = 0; i < n; i++) {
+        h_x[i] = 1;
+        h_y[i] = 2;
+    }
+    
+    cudaMemcpy(d_x, h_x, n*sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_y, h_y, n*sizeof(float), cudaMemcpyHostToDevice);
+    
+    axpbyCuda(4, d_x, 3, d_y, n/2);
+    
+    cudaMemcpy( h_res, d_y, n*sizeof(float), cudaMemcpyDeviceToHost);
+
+    for (int i = 0; i < n; i++) {
+        float expect = (i < n/2) ? 4*h_x[i] + 3*h_y[i] : h_y[i];
+        if (h_res[i] != expect)
+            printf("FAILED %d : %f != %f\n", i, h_res[i], h_y[i]);
+    }
+    
+    cudaFree(d_y);
+    cudaFree(d_x);
+    free(h_x);
+    free(h_y);
+}
--- a/blas_quda.h
+++ b/blas_quda.h
+#ifndef _QUDA_BLAS_H
+#define _QUDA_BLAS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ---------- blas_cuda.cu ----------
+
+void zeroCuda(float* dst, int cnt);
+void copyCuda(float* dst, float *src, int len);
+
+void axpbyCuda(float a, float *x, float b, float *y, int len);
+void axpyCuda(float a, float *x, float *y, int len);
+void axCuda(float a, float *x, int len);
+void xpayCuda(float *x, float a, float *y, int len);
+void mxpyCuda(float *x, float *y, int len);
+
+void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len);
+float axpyNormCuda(float a, float *x, float *y, int len);
+
+float sumCuda(float *a, int n);
+float normCuda(float *a, int n);
+float reDotProductCuda(float *a, float *b, int n);
+
+void blasTest();
+void axpbyTest();
+
+void caxpbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len);
+void caxpyCuda(float2 a, float2 *x, float2 *y, int len);
+void cxpaypbzCuda(float2 *x, float2 b, float2 *y, float2 c, float2 *z, int len);
+float2 cDotProductCuda(float2*, float2*, int len);
+void caxpbypzYmbwCuda(float2, float2*, float2, float2*, float2*, float2*, int len);
+float3 cDotProductNormACuda(float2 *a, float2 *b, int n);
+float3 cDotProductNormBCuda(float2 *a, float2 *b, int n);
+float3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y, 
+					  float2 *z, float2 *w, float2 *u, int len);
+
+float2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int len);
+
+// ---------- blas_reference.cpp ----------
+
+void zero(float* a, int cnt);
+void copy(float* a, float *b, int len);
+
+void ax(float a, float *x, int len);
+
+void axpbyCuda(float a, float *x, float b, float *y, int len);
+void axpy(float a, float *x, float *y, int len);
+void xpay(float *x, float a, float *y, int len);
+void mxpy(float *x, float *y, int len);
+
+float norm(float *vector, int len);
+float reDotProduct(float *v1, float *v2, int len);
+float imDotProduct(float *v1, float *v2, int len);
+double normD(float *vector, int len);
+double reDotProductD(float *v1, float *v2, int len);
+double imDotProductD(float *v1, float *v2, int len);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _QUDA_BLAS_H
--- a/blas_reference.cpp
+++ b/blas_reference.cpp
+#include <quda.h>
+
+
+// sets all elements of the destination vector to zero
+void zero(float* a, int cnt) {
+    for (int i = 0; i < cnt; i++)
+        a[i] = 0;
+}
+
+// copy one spinor to the other
+void copy(float* a, float *b, int len) {
+  for (int i = 0; i < len; i++) a[i] = b[i];
+}
+
+// performs the operation x[i] *= a
+void ax(float a, float *x, int len) {
+    for (int i=0; i<len; i++) x[i] *= a;
+}
+
+// performs the operation y[i] = a*x[i] + b*y[i]
+void axpby(float a, float *x, float b, float *y, int len) {
+    for (int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
+}
+
+// performs the operation y[i] = a*x[i] + y[i]
+void axpy(float a, float *x, float *y, int len) {
+    for (int i=0; i<len; i++) y[i] += a*x[i];
+}
+
+// performs the operation y[i] = x[i] + a*y[i]
+void xpay(float *x, float a, float *y, int len) {
+    for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];
+}
+
+// performs the operation y[i] -= x[i] (minus x plus y)
+void mxpy(float *x, float *y, int len) {
+    for (int i=0; i<len; i++) y[i] -= x[i];
+}
+
+
+// returns the square of the L2 norm of the vector
+float norm(float *v, int len) {
+
+  float sum=0.0;
+  for (int i=0; i<len; i++) {
+    sum += v[i]*v[i];
+  }
+
+  return sum;
+}
+
+// returns the real part of the dot product of 2 complex valued vectors
+float reDotProduct(float *v1, float *v2, int len) {
+
+  float dot=0.0;
+  for (int i=0; i<len; i++) {
+    dot += v1[i]*v2[i];
+  }
+
+  return dot;
+}
+
+// returns the imaginary part of the dot product of 2 complex valued vectors
+float imDotProduct(float *v1, float *v2, int len) {
+
+  float dot=0.0;
+  for (int i=0; i<len; i+=2) {
+    dot += v1[i]*v2[i+1] - v1[i+1]*v2[i];
+  }
+
+  return dot;
+}
+
+// returns the square of the L2 norm of the vector
+double normD(float *v, int len) {
+
+  double sum=0.0;
+  for (int i=0; i<len; i++) {
+    sum += v[i]*v[i];
+  }
+
+  return sum;
+}
+
+// returns the real part of the dot product of 2 complex valued vectors
+double reDotProductD(float *v1, float *v2, int len) {
+
+  double dot=0.0;
+  for (int i=0; i<len; i++) {
+    dot += v1[i]*v2[i];
+  }
+
+  return dot;
+}
+
+// returns the imaginary part of the dot product of 2 complex valued vectors
+double imDotProductD(float *v1, float *v2, int len) {
+
+  double dot=0.0;
+  for (int i=0; i<len; i+=2) {
+    dot += v1[i]*v2[i+1] - v1[i+1]*v2[i];
+  }
+
+  return dot;
+}
--- a/dslash_core.h
+++ b/dslash_core.h
--- a/dslash_cuda_gen.py
+++ b/dslash_cuda_gen.py
+
+### complex numbers ########################################################################
+
+def complexify(a):
+    return [complex(x) for x in a]
+
+def complexToStr(c):
+    def fltToString(a):
+        if a == int(a): return `int(a)`
+        else: return `a`
+    
+    def imToString(a):
+        if a == 0: return "0i"
+        elif a == -1: return "-i"
+        elif a == 1: return "i"
+        else: return fltToString(a)+"i"
+    
+    re = c.real
+    im = c.imag
+    if re == 0 and im == 0: return "0"
+    elif re == 0: return imToString(im)
+    elif im == 0: return fltToString(re)
+    else:
+        im_str = "-"+imToString(-im) if im < 0 else "+"+imToString(im)
+        return fltToString(re)+im_str
+
+
+### projector matrices ########################################################################
+
+id = complexify([
+    1, 0, 0, 0,
+    0, 1, 0, 0,
+    0, 0, 1, 0,
+    0, 0, 0, 1
+])
+
+gamma1 = complexify([
+    0,  0, 0, 1j,
+    0,  0, 1j, 0,
+    0, -1j, 0, 0,
+    -1j,  0, 0, 0
+])
+
+gamma2 = complexify([
+    0, 0, 0, -1,
+    0, 0, 1,  0,
+    0, 1, 0,  0,
+    -1, 0, 0,  0
+])
+
+gamma3 = complexify([
+    0, 0, 1j,  0,
+    0, 0, 0, -1j,
+    -1j, 0, 0,  0,
+    0, 1j, 0,  0
+])
+
+gamma4 = complexify([
+    0, 0, 1, 0,
+    0, 0, 0, 1,
+    1, 0, 0, 0,
+    0, 1, 0, 0
+])
+
+
+def gplus(g1, g2):
+    return [x+y for (x,y) in zip(g1,g2)]
+
+def gminus(g1, g2):
+    return [x-y for (x,y) in zip(g1,g2)]
+
+def projectorToStr(p):
+    out = ""
+    for i in range(0, 4):
+        for j in range(0,4):
+            out += complexToStr(p[4*i+j]) + " "
+        out += "\n"
+    return out
+
+projectors = [
+    gminus(id,gamma1), gplus(id,gamma1),
+    gminus(id,gamma2), gplus(id,gamma2),
+    gminus(id,gamma3), gplus(id,gamma3),
+    gminus(id,gamma4), gplus(id,gamma4)
+]
+
+
+### code generation  ########################################################################
+
+### parameters
+sharedFloats = 19
+dagger = False
+
+def block(code):
+    lines = ''.join(["    "+line+"\n" for line in code.splitlines()])
+    return "{\n"+lines+"}\n"
+
+def sign(x):
+    if x==1: return "+"
+    elif x==-1: return "-"
+
+def nthFloat4(n):
+    return `(n/4)` + "." + ["x", "y", "z", "w"][n%4]
+
+def in_re(s, c): return "i"+`s`+`c`+"_re"
+def in_im(s, c): return "i"+`s`+`c`+"_im"
+def g_re(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_re"
+def g_im(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_im"
+def out_re(s, c): return "o"+`s`+`c`+"_re"
+def out_im(s, c): return "o"+`s`+`c`+"_im"
+def h1_re(h, c): return ["a","b"][h]+`c`+"_re"
+def h1_im(h, c): return ["a","b"][h]+`c`+"_im"
+def h2_re(h, c): return ["A","B"][h]+`c`+"_re"
+def h2_im(h, c): return ["A","B"][h]+`c`+"_im"
+
+
+
+def prolog():
+    str = []
+    str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
+    str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
+    str.append("#define SHARED_BYTES (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
+    
+    for s in range(0,4):
+        for c in range(0,3):
+            i = 3*s+c
+            str.append("#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n")
+            str.append("#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n")
+    str.append("\n")
+    for m in range(0,3):
+        for n in range(0,3):
+            i = 3*m+n
+            str.append("#define "+g_re(0,m,n)+" G"+nthFloat4(2*i+0)+"\n")
+            str.append("#define "+g_im(0,m,n)+" G"+nthFloat4(2*i+1)+"\n")
+    str.append("\n")
+    for m in range(0,3):
+        for n in range(0,3):
+            i = 3*m+n
+            str.append("#define "+g_re(1,m,n)+" (+"+g_re(0,n,m)+")\n")
+            str.append("#define "+g_im(1,m,n)+" (-"+g_im(0,n,m)+")\n")
+    str.append("\n")
+    
+    for s in range(0,4):
+        for c in range(0,3):
+            i = 3*s+c
+            if 2*i < sharedFloats:
+                str.append("#define "+out_re(s,c)+" s["+`(2*i+0)`+"]\n")
+            else:
+                str.append("volatile float "+out_re(s,c)+";\n")
+            if 2*i+1 < sharedFloats:
+                str.append("#define "+out_im(s,c)+" s["+`(2*i+1)`+"]\n")
+            else:
+                str.append("volatile float "+out_im(s,c)+";\n")
+    str.append("\n")
+    
+    str.append(
+"""
+// Performs the complex conjugated accumulation: a += b* c*
+#define ACC_CONJ_PROD(a, b, c) \\
+    a##_re += b##_re * c##_re - b##_im * c##_im, \\
+    a##_im -= b##_re * c##_im + b##_im * c##_re
+
+#define READ_GAUGE_MATRIX(gauge, dir) \\
+    float4 G0 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+0)*Nh); \\
+    float4 G1 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+1)*Nh); \\
+    float4 G2 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+2)*Nh); \\
+    float4 G3 = make_float4(0,0,0,0); \\
+    float4 G4 = make_float4(0,0,0,0); \\
+    ACC_CONJ_PROD(g20, +g01, +g12); \\
+    ACC_CONJ_PROD(g20, -g02, +g11); \\
+    ACC_CONJ_PROD(g21, +g02, +g10); \\
+    ACC_CONJ_PROD(g21, -g00, +g12); \\
+    ACC_CONJ_PROD(g22, +g00, +g11); \\
+    ACC_CONJ_PROD(g22, -g01, +g10); \\
+    float u0 = (dir < 6 ? SPATIAL_SCALING : (ga_idx >= (L4-1)*L1h*L2*L3 ? TIME_SYMMETRY : 1)); \\
+    G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
+
+#define READ_SPINOR(spinor) \\
+    float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \\
+    float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \\
+    float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \\
+    float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \\
+    float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \\
+    float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh);
+
+int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
+int boundaryCrossings = sid/L1h + sid/(L2*L1h) + sid/(L3*L2*L1h);
+int X = 2*sid + (boundaryCrossings + oddBit) % 2;
+int x4 = X/(L3*L2*L1);
+int x3 = (X/(L2*L1)) % L3;
+int x2 = (X/L1) % L2;
+int x1 = X % L1;
+
+""")
+    
+    str.append("extern __shared__ float s_data[];\n")
+    str.append("volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n\n")
+    
+    for s in range(0,4):
+        for c in range(0,3):
+            str.append(out_re(s,c) + " = " + out_im(s,c)+" = 0;\n")
+    str.append("\n")
+    
+    return ''.join(str)
+# end def prolog
+
+
+
+def epilog():
+    str = []
+    str.append(
+"""
+#ifdef DSLASH_XPAY
+    float4 accum0 = tex1Dfetch(accumTex, sid + 0*Nh);
+    float4 accum1 = tex1Dfetch(accumTex, sid + 1*Nh);
+    float4 accum2 = tex1Dfetch(accumTex, sid + 2*Nh);
+    float4 accum3 = tex1Dfetch(accumTex, sid + 3*Nh);
+    float4 accum4 = tex1Dfetch(accumTex, sid + 4*Nh);
+    float4 accum5 = tex1Dfetch(accumTex, sid + 5*Nh);
+""")
+    
+    for s in range(0,4):
+        for c in range(0,3):
+            i = 3*s+c
+            str.append("    "+out_re(s,c) +" = a*"+out_re(s,c)+" + accum"+nthFloat4(2*i+0)+";\n")
+            str.append("    "+out_im(s,c) +" = a*"+out_im(s,c)+" + accum"+nthFloat4(2*i+1)+";\n")
+    str.append("#endif\n\n")
+    
+    str.append(
+"""
+#ifdef WRITE_FLOAT4
+// this code exhibits a hardware bug in our C870 card
+g_out[0*Nh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im);
+g_out[1*Nh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im);
+g_out[2*Nh+sid] = make_float4(o11_re, o11_im, o12_re, o12_im);
+g_out[3*Nh+sid] = make_float4(o20_re, o20_im, o21_re, o21_im);
+g_out[4*Nh+sid] = make_float4(o22_re, o22_im, o30_re, o30_im);
+g_out[5*Nh+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
+#endif
+
+#ifdef WRITE_FLOAT1_SMEM
+int t = threadIdx.x;
+int B = BLOCK_DIM;
+int b = blockIdx.x;
+int f = SHARED_FLOATS_PER_THREAD;
+__syncthreads();
+for (int i = 0; i < 6; i++) // spinor indices
+    for (int c = 0; c < 4; c++) // components of float4
+        ((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
+#endif
+
+#ifdef WRITE_FLOAT1_STAGGERED
+// the alternative to writing float4's directly: almost as fast, a lot more confusing
+int t = threadIdx.x;
+int B = BLOCK_DIM;
+int b = blockIdx.x;
+int f = SHARED_FLOATS_PER_THREAD;
+__syncthreads();
+for (int i = 0; i < 4; i++) // spinor indices
+    for (int c = 0; c < 4; c++) // components of float4
+        ((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
+__syncthreads();
+s[0] = o22_re;
+s[1] = o22_im;
+s[2] = o30_re;
+s[3] = o30_im;
+s[4] = o31_re;
+s[5] = o31_im;
+s[6] = o32_re;
+s[7] = o32_im;
+__syncthreads();
+for (int i = 0; i < 2; i++)
+    for (int c = 0; c < 4; c++)
+        ((float*)g_out)[(i+4)*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
+#endif
+
+""")
+    return ''.join(str)
+# end def prolog
+
+
+
+def gen(dir):
+    projIdx = dir if not dagger else dir + (1 - 2*(dir%2))
+    projStr = projectorToStr(projectors[projIdx])
+    def proj(i,j):
+        return projectors[projIdx][4*i+j]
+    
+    # if row(i) = (j, c), then the i'th row of the projector can be represented
+    # as a multiple of the j'th row: row(i) = c row(j)
+    def row(i):
+        assert i==2 or i==3
+        if proj(i,0) == 0j:
+            return (1, proj(i,1))
+        if proj(i,1) == 0j:
+            return (0, proj(i,0))
+    
+    str = []
+    
+    projName = "P"+`dir/2`+["-","+"][projIdx%2]
+    str.append("// Projector "+projName+"\n")
+    for l in projStr.splitlines():
+        str.append("// "+l+"\n")
+    str.append("\n")
+    
+    if dir == 0: str.append("int sp_idx = ((x1==L1-1) ? X-(L1-1) : X+1) / 2;\n")
+    if dir == 1: str.append("int sp_idx = ((x1==0)    ? X+(L1-1) : X-1) / 2;\n")
+    if dir == 2: str.append("int sp_idx = ((x2==L2-1) ? X-(L2-1)*L1 : X+L1) / 2;\n")
+    if dir == 3: str.append("int sp_idx = ((x2==0)    ? X+(L2-1)*L1 : X-L1) / 2;\n")
+    if dir == 4: str.append("int sp_idx = ((x3==L3-1) ? X-(L3-1)*L2*L1 : X+L2*L1) / 2;\n")
+    if dir == 5: str.append("int sp_idx = ((x3==0)    ? X+(L3-1)*L2*L1 : X-L2*L1) / 2;\n")
+    if dir == 6: str.append("int sp_idx = ((x4==L4-1) ? X-(L4-1)*L3*L2*L1 : X+L3*L2*L1) / 2;\n")
+    if dir == 7: str.append("int sp_idx = ((x4==0)    ? X+(L4-1)*L3*L2*L1 : X-L3*L2*L1) / 2;\n")
+    
+    ga_idx = "sid" if dir % 2 == 0 else "sp_idx"
+    str.append("int ga_idx = "+ga_idx+";\n\n")
+    
+    project = []
+    project.append("// read spinor from device memory\n")
+    project.append("READ_SPINOR(spinorTex);\n\n")
+    project.append("// project spinor into half spinors\n")
+    for h in range(0, 2):
+        for c in range(0, 3):
+            strRe = []
+            strIm = []
+            for s in range(0, 4):
+                re = proj(h,s).real
+                im = proj(h,s).imag
+                if re==0 and im==0: ()
+                elif im==0:
+                    strRe.append(sign(re)+in_re(s,c))
+                    strIm.append(sign(re)+in_im(s,c))
+                elif re==0:
+                    strRe.append(sign(-im)+in_im(s,c))
+                    strIm.append(sign(im)+in_re(s,c))
+            project.append("float "+h1_re(h,c)+ " = "+''.join(strRe)+";\n")
+            project.append("float "+h1_im(h,c)+ " = "+''.join(strIm)+";\n")
+        project.append("\n")
+    
+    ident = []
+    ident.append("// identity gauge matrix\n")
+    for m in range(0,3):
+        for h in range(0,2):
+            ident.append("float "+h2_re(h,m)+" = " + h1_re(h,m) + "; ")
+            ident.append("float "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n")
+    ident.append("\n")
+    
+    mult = []
+    mult.append("// read gauge matrix from device memory\n")
+    mult.append("READ_GAUGE_MATRIX(gauge"+`dir%2`+"Tex, "+`dir`+");\n\n")
+    for m in range(0,3):
+        mult.append("// multiply row "+`m`+"\n")
+        for h in range(0,2):
+            re = ["float "+h2_re(h,m)+" ="]
+            im = ["float "+h2_im(h,m)+" ="]
+            for c in range(0,3):
+                re.append(" + ("+g_re(dir,m,c)+" * "+h1_re(h,c)+" - "+g_im(dir,m,c)+" * "+h1_im(h,c)+")")
+                im.append(" + ("+g_re(dir,m,c)+" * "+h1_im(h,c)+" + "+g_im(dir,m,c)+" * "+h1_re(h,c)+")")
+            mult.append(''.join(re)+";\n")
+            mult.append(''.join(im)+";\n")
+        mult.append("\n")
+    
+    reconstruct = []
+    for m in range(0,3):
+        reconstruct.append(out_re(0, m) + " += " + h2_re(0,m) + ";\n")
+        reconstruct.append(out_im(0, m) + " += " + h2_im(0,m) + ";\n")
+        reconstruct.append(out_re(1, m) + " += " + h2_re(1,m) + ";\n")
+        reconstruct.append(out_im(1, m) + " += " + h2_im(1,m) + ";\n")
+    
+        for s in range(2,4):
+            (h,c) = row(s)
+            re = c.real
+            im = c.imag
+            if im == 0:
+                reconstruct.append(out_re(s, m) + " " + sign(re) + "= " + h2_re(h,m) + ";\n")
+                reconstruct.append(out_im(s, m) + " " + sign(re) + "= " + h2_im(h,m) + ";\n")
+            elif re == 0:
+                reconstruct.append(out_re(s, m) + " " + sign(-im) + "= " + h2_im(h,m) + ";\n")
+                reconstruct.append(out_im(s, m) + " " + sign(+im) + "= " + h2_re(h,m) + ";\n")
+        
+        reconstruct.append("\n")
+        
+    if dir >= 6:
+        str.append("if (GAUGE_FIXED && ga_idx >= L1h*L2*L3) ")
+        str.append(block(''.join(project) + ''.join(ident) + ''.join(reconstruct)))
+        str.append("else ")
+        str.append(block(''.join(project) + ''.join(mult) + ''.join(reconstruct)))
+    else:
+        str.append(''.join(project) + ''.join(mult) + ''.join(reconstruct))
+    
+    return block(''.join(str))+"\n"
+# end def gen
+
+
+
+def generate():
+    return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + epilog()
+
+# dagger = False
+dagger = True
+print generate()
--- a/dslash_dagger_core.h
+++ b/dslash_dagger_core.h
--- a/dslash_def.h
+++ b/dslash_def.h
+// All dslash definitions 
+
+__global__ void
+dslashSingle12Kernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashSingle12DaggerKernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+
+#define DSLASH_XPAY
+
+__global__ void
+dslashSingle12XpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashSingle12DaggerXpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+#undef DSLASH_XPAY
+
+__global__ void
+dslashSingle8Kernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashSingle8DaggerKernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+
+#define DSLASH_XPAY
+
+__global__ void
+dslashSingle8XpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashSingle8DaggerXpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
+#define GAUGE0TEX gauge0TexSingle
+#define GAUGE1TEX gauge1TexSingle
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+#undef DSLASH_XPAY
+
+//////////////////////////
+
+__global__ void
+dslashHalf12Kernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashHalf12DaggerKernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+
+#define DSLASH_XPAY
+
+__global__ void
+dslashHalf12XpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashHalf12DaggerXpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+#undef DSLASH_XPAY
+
+__global__ void
+dslashHalf8Kernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashHalf8DaggerKernel(float4* g_out, int oddBit) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+
+#define DSLASH_XPAY
+
+__global__ void
+dslashHalf8XpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+__global__ void
+dslashHalf8DaggerXpayKernel(float4* g_out, int oddBit, float a) {
+#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
+#define GAUGE0TEX gauge0TexHalf
+#define GAUGE1TEX gauge1TexHalf
+#include "dslash_dagger_core.h"
+#undef GAUGE1TEX
+#undef GAUGE0TEX
+#undef READ_GAUGE_MATRIX
+}
+
+#undef DSLASH_XPAY
+
+
+
--- a/dslash_quda.cu
+++ b/dslash_quda.cu
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <dslash_quda.h>
+
+// ----------------------------------------------------------------------
+// Cuda code
+
+// Half precision gauge field
+texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
+texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
+
+// Single precision gauge field
+texture<float4, 1, cudaReadModeElementType> gauge0TexSingle;
+texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
+
+texture<float4, 1, cudaReadModeElementType> spinorTex;
+texture<float4, 1, cudaReadModeElementType> accumTex;
+
+QudaGaugeParam *gauge_param;
+
+__constant__ int X1;
+__constant__ int X2;
+__constant__ int X3;
+__constant__ int X4;
+__constant__ int X1h;
+
+__constant__ float anisotropy;
+__constant__ float t_boundary;
+__constant__ int gauge_fixed;
+
+#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
+//#define WRITE_SPINOR WRITE_FLOAT1_SMEM
+//#define WRITE_SPINOR WRITE_SPINOR_FLOAT1_STAGGERED
+
+#include <dslash_def.h>
+
+void setCudaGaugeParam() {
+  cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(float));
+  float t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
+  cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(float));
+  int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
+  cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
+}
+
+
+// ----------------------------------------------------------------------
+
+void dslashCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, 
+		int oddBit, int daggerBit) {
+
+  int packed_gauge_bytes = (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) ?
+    PACKED12_GAUGE_BYTES : PACKED8_GAUGE_BYTES;
+  if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) packed_gauge_bytes/=2;
+
+  if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
+    if (oddBit) {
+      cudaBindTexture(0, gauge0TexSingle, gauge.odd, packed_gauge_bytes); 
+      cudaBindTexture(0, gauge1TexSingle, gauge.even, packed_gauge_bytes);
+    } else {
+      cudaBindTexture(0, gauge0TexSingle, gauge.even, packed_gauge_bytes);
+      cudaBindTexture(0, gauge1TexSingle, gauge.odd, packed_gauge_bytes); 
+    }
+  } else {
+    if (oddBit) {
+      cudaBindTexture(0, gauge0TexHalf, gauge.odd, packed_gauge_bytes); 
+      cudaBindTexture(0, gauge1TexHalf, gauge.even, packed_gauge_bytes);
+    } else {
+      cudaBindTexture(0, gauge0TexHalf, gauge.even, packed_gauge_bytes);
+      cudaBindTexture(0, gauge1TexHalf, gauge.odd, packed_gauge_bytes); 
+    }
+  }
+
+  cudaBindTexture(0, spinorTex, spinor, SPINOR_BYTES); 
+  
+  dim3 gridDim(GRID_DIM, 1, 1);
+  dim3 blockDim(BLOCK_DIM, 1, 1);
+
+  if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
+    if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
+      if (!daggerBit) {
+	dslashSingle12Kernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      } else {
+	dslashSingle12DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      }
+    } else {
+      if (!daggerBit) {
+	dslashSingle8Kernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      } else {
+	dslashSingle8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      }
+    }
+  } else {
+    if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
+      if (!daggerBit) {
+	dslashHalf12Kernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      } else {
+	dslashHalf12DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      }
+    } else {
+      if (!daggerBit) {
+	dslashHalf8Kernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      } else {
+	dslashHalf8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit);
+      }
+    }
+  }
+}
+
+void dslashXpayCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, 
+		    int oddBit, int daggerBit, ParitySpinor x, float a) {
+  int packed_gauge_bytes = (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) ?
+    PACKED12_GAUGE_BYTES : PACKED8_GAUGE_BYTES;
+  if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) packed_gauge_bytes/=2;
+
+  if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
+    if (oddBit) {
+      cudaBindTexture(0, gauge0TexSingle, gauge.odd, packed_gauge_bytes); 
+      cudaBindTexture(0, gauge1TexSingle, gauge.even, packed_gauge_bytes);
+    } else {
+      cudaBindTexture(0, gauge0TexSingle, gauge.even, packed_gauge_bytes);
+      cudaBindTexture(0, gauge1TexSingle, gauge.odd, packed_gauge_bytes); 
+    }
+  } else {
+    if (oddBit) {
+      cudaBindTexture(0, gauge0TexHalf, gauge.odd, packed_gauge_bytes); 
+      cudaBindTexture(0, gauge1TexHalf, gauge.even, packed_gauge_bytes);
+    } else {
+      cudaBindTexture(0, gauge0TexHalf, gauge.even, packed_gauge_bytes);
+      cudaBindTexture(0, gauge1TexHalf, gauge.odd, packed_gauge_bytes); 
+    }
+  }
+
+  cudaBindTexture(0, spinorTex, spinor, SPINOR_BYTES); 
+  cudaBindTexture(0, accumTex, x, SPINOR_BYTES); 
+  
+  dim3 gridDim(GRID_DIM, 1, 1);
+  dim3 blockDim(BLOCK_DIM, 1, 1);
+    
+  if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
+    if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
+      if (!daggerBit) {
+	dslashSingle12XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      } else {
+	dslashSingle12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      }
+    } else if (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) {
+      if (!daggerBit) {
+	dslashSingle8XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      }
+      else {
+	dslashSingle8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      }
+    }
+  } else {
+    if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
+      if (!daggerBit) {
+	dslashHalf12XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+    } else {
+	dslashHalf12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+    }
+    } else if (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) {
+      if (!daggerBit) {
+	dslashHalf8XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      }
+      else {
+	dslashHalf8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>> 
+	  ((float4 *)res, oddBit, a);
+      }
+    }
+  }
+
+}
+
+int dslashCudaSharedBytes() {
+  return SHARED_BYTES;
+}
+
+// Apply the even-odd preconditioned Dirac operator
+void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa, 
+	       ParitySpinor tmp, MatPCType matpc_type) {
+  float kappa2 = -kappa*kappa;
+
+  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+    dslashCuda(tmp, gauge, in, 1, 0);
+    dslashXpayCuda(out, gauge, tmp, 0, 0, in, kappa2); 
+  } else {
+    dslashCuda(tmp, gauge, in, 0, 0);
+    dslashXpayCuda(out, gauge, tmp, 1, 0, in, kappa2); 
+  }
+}
+
+// Apply the even-odd preconditioned Dirac operator
+void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa, 
+		  ParitySpinor tmp, MatPCType matpc_type) {
+  float kappa2 = -kappa*kappa;
+
+  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+    dslashCuda(tmp, gauge, in, 1, 1);
+    dslashXpayCuda(out, gauge, tmp, 0, 1, in, kappa2);
+  } else {
+    dslashCuda(tmp, gauge, in, 0, 1);
+    dslashXpayCuda(out, gauge, tmp, 1, 1, in, kappa2);
+  }
+}
+
+// Apply the even-odd preconditioned Dirac operator
+cuComplex MatPCcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa, 
+		     ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
+  float kappa2 = -kappa*kappa;
+
+  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+    dslashCuda(tmp, gauge, in, 1, 0);
+    dslashCuda(out, gauge, tmp, 0, 0); 
+  } else {
+    dslashCuda(tmp, gauge, in, 0, 0);
+    dslashCuda(out, gauge, tmp, 1, 0); 
+  }
+
+  // out = in - kappa2*out, dot = (d, out)
+  return xpaycDotzyCuda((float2*)in, kappa2, (float2*)out, (float2*)d, Nh*spinorSiteSize/2);
+}
+
+// Apply the even-odd preconditioned Dirac operator
+cuComplex MatPCDagcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa, 
+			ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
+  float kappa2 = -kappa*kappa;
+
+  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+    dslashCuda(tmp, gauge, in, 1, 1);
+    dslashCuda(out, gauge, tmp, 0, 1);
+  } else {
+    dslashCuda(tmp, gauge, in, 0, 1);
+    dslashCuda(out, gauge, tmp, 1, 1);
+  }
+
+  // out = in - kappa2*out, dot = (d, out)
+  return xpaycDotzyCuda((float2*)in, kappa2, (float2*)out, (float2*)d, Nh*spinorSiteSize/2);
+}
+
+void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, 
+		       float kappa, ParitySpinor tmp, MatPCType matpc_type) {
+  MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
+  MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
+}
--- a/dslash_quda.h
+++ b/dslash_quda.h
+#ifndef _QUDA_DSLASH_H
+#define _QUDA_DSLASH_H
+
+#include <cuComplex.h>
+
+#include <quda.h>
+
+#define packed12GaugeSiteSize 12 // real numbers per link, using SU(3) reconstruction
+#define packed8GaugeSiteSize 8 // real numbers per link, using SU(3) reconstruction
+
+#define gaugeSiteSize 18 // real numbers per link
+#define spinorSiteSize 24 // real numbers per spinor
+
+#define BLOCK_DIM (64) // threads per block
+#define GRID_DIM (Nh/BLOCK_DIM) // there are Nh threads in total
+
+#define SPINOR_BYTES (Nh*spinorSiteSize*sizeof(float))
+
+#define PACKED12_GAUGE_BYTES (4*Nh*packed12GaugeSiteSize*sizeof(float))
+#define PACKED8_GAUGE_BYTES (4*Nh*packed8GaugeSiteSize*sizeof(float))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern QudaGaugeParam *gauge_param;
+
+// ---------- dslash_quda.cu ----------
+
+  void setCudaGaugeParam();
+
+  void dslashCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
+		  int oddBit, int daggerBit);
+  void dslashXpayCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, 
+		      int oddBit, int daggerBit, ParitySpinor x, float a);
+  int  dslashCudaSharedBytes();
+  
+  void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
+		 float kappa, ParitySpinor tmp, MatPCType matpc_type);
+  void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
+		    float kappa, ParitySpinor tmp, MatPCType matpc_type);
+  void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
+			 float kappa, ParitySpinor tmp, MatPCType matpc_type);
+  
+  cuComplex MatPCcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
+			    float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
+  cuComplex MatPCDagcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven, 
+			       float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
+  
+  // ---------- dslash_reference.cpp ----------
+  
+  void dslashReference(float *res, float **gauge, float *spinorField, 
+		       int oddBit, int daggerBit);
+  
+  void Mat(float *out, float **gauge, float *in, float kappa);
+  void MatDag(float *out, float **gauge, float *in, float kappa);
+  void MatDagMat(float *out, float **gauge, float *in, float kappa);
+  
+  void MatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
+  void MatPCDag(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
+  void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
+  
+  // -- inv_cg_cuda.cpp
+  void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 
+		    ParitySpinor tmp, QudaInvertParam *param);
+  
+// -- inv_bicgstab_cuda.cpp
+void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 
+			ParitySpinor tmp, QudaInvertParam *param, DagType dag_type);
+  
+// ---------- cg_reference.cpp ----------  
+void cgReference(float *out, float **gauge, float *in, float kappa, float tol);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _QUDA_DLASH_H
--- a/dslash_reference.cpp
+++ b/dslash_reference.cpp
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <quda.h>
+#include <util_quda.h>
+
+void sum(float *dst, float *a, float *b, int cnt) {
+    for (int i = 0; i < cnt; i++)
+        dst[i] = a[i] + b[i];
+}
+
+// i represents a "half index" into an even or odd "half lattice".
+// when oddBit={0,1} the half lattice is {even,odd}.
+// 
+// the displacements, such as dx, refer to the full lattice coordinates. 
+//
+// neighborIndex() takes a "half index", displaces it, and returns the
+// new "half index", which can be an index into either the even or odd lattices.
+// displacements of magnitude one always interchange odd and even lattices.
+//
+int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
+    int X = fullLatticeIndex(i, oddBit);
+    int x4 = X/(L3*L2*L1);
+    int x3 = (X/(L2*L1)) % L3;
+    int x2 = (X/L1) % L2;
+    int x1 = X % L1;
+
+    // assert (oddBit == (x+y+z+t)%2);
+    
+    x4 = (x4+dx4+L4) % L4;
+    x3 = (x3+dx3+L3) % L3;
+    x2 = (x2+dx2+L2) % L2;
+    x1 = (x1+dx1+L1) % L1;
+    
+    return (x4*(L3*L2*L1) + x3*(L2*L1) + x2*(L1) + x1) / 2;
+}
+
+float *gaugeLink(int i, int dir, int oddBit, float **gaugeEven, float **gaugeOdd) {
+    float **gaugeField;
+    int j;
+    
+    if (dir % 2 == 0) {
+        j = i;
+        gaugeField = (oddBit ? gaugeOdd : gaugeEven);
+    }
+    else {
+        switch (dir) {
+            case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -1); break;
+            case 3: j = neighborIndex(i, oddBit, 0, 0, -1, 0); break;
+            case 5: j = neighborIndex(i, oddBit, 0, -1, 0, 0); break;
+            case 7: j = neighborIndex(i, oddBit, -1, 0, 0, 0); break;
+            default: j = -1; break;
+        }
+        gaugeField = (oddBit ? gaugeEven : gaugeOdd);
+    }
+
+    return &gaugeField[dir/2][j*(3*3*2)];
+}
+
+float *spinorNeighbor(int i, int dir, int oddBit, float *spinorField) {
+    int j;
+    switch (dir) {
+        case 0: j = neighborIndex(i, oddBit, 0, 0, 0, +1); break;
+        case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -1); break;
+        case 2: j = neighborIndex(i, oddBit, 0, 0, +1, 0); break;
+        case 3: j = neighborIndex(i, oddBit, 0, 0, -1, 0); break;
+        case 4: j = neighborIndex(i, oddBit, 0, +1, 0, 0); break;
+        case 5: j = neighborIndex(i, oddBit, 0, -1, 0, 0); break;
+        case 6: j = neighborIndex(i, oddBit, +1, 0, 0, 0); break;
+        case 7: j = neighborIndex(i, oddBit, -1, 0, 0, 0); break;
+        default: j = -1; break;
+    }
+    
+    return &spinorField[j*(4*3*2)];
+}
+
+
+
+void dot(float* res, float* a, float* b) {
+    res[0] = res[1] = 0;
+    for (int m = 0; m < 3; m++) {
+        float a_re = a[2*m+0];
+        float a_im = a[2*m+1];
+        float b_re = b[2*m+0];
+        float b_im = b[2*m+1];
+        res[0] += a_re * b_re - a_im * b_im;
+        res[1] += a_re * b_im + a_im * b_re;
+    }
+}
+
+void su3_transpose(float *res, float *mat) {
+    for (int m = 0; m < 3; m++) {
+        for (int n = 0; n < 3; n++) {
+            res[m*(3*2) + n*(2) + 0] = + mat[n*(3*2) + m*(2) + 0];
+            res[m*(3*2) + n*(2) + 1] = - mat[n*(3*2) + m*(2) + 1];
+        }
+    }
+}
+
+void su3_mul(float *res, float *mat, float *vec) {
+    for (int n = 0; n < 3; n++) {
+        dot(&res[n*(2)], &mat[n*(3*2)], vec);
+    }
+}
+
+void su3_Tmul(float *res, float *mat, float *vec) {
+    float matT[3*3*2];
+    su3_transpose(matT, mat);
+    su3_mul(res, matT, vec);
+}
+
+const float projector[8][4][4][2] = {
+    {
+      {{1,0}, {0,0}, {0,0}, {0,-1}},
+      {{0,0}, {1,0}, {0,-1}, {0,0}},
+      {{0,0}, {0,1}, {1,0}, {0,0}},
+      {{0,1}, {0,0}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {0,0}, {0,1}},
+      {{0,0}, {1,0}, {0,1}, {0,0}},
+      {{0,0}, {0,-1}, {1,0}, {0,0}},
+      {{0,-1}, {0,0}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {0,0}, {1,0}},
+      {{0,0}, {1,0}, {-1,0}, {0,0}},
+      {{0,0}, {-1,0}, {1,0}, {0,0}},
+      {{1,0}, {0,0}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {0,0}, {-1,0}},
+      {{0,0}, {1,0}, {1,0}, {0,0}},
+      {{0,0}, {1,0}, {1,0}, {0,0}},
+      {{-1,0}, {0,0}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {0,-1}, {0,0}},
+      {{0,0}, {1,0}, {0,0}, {0,1}},
+      {{0,1}, {0,0}, {1,0}, {0,0}},
+      {{0,0}, {0,-1}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {0,1}, {0,0}},
+      {{0,0}, {1,0}, {0,0}, {0,-1}},
+      {{0,-1}, {0,0}, {1,0}, {0,0}},
+      {{0,0}, {0,1}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {-1,0}, {0,0}},
+      {{0,0}, {1,0}, {0,0}, {-1,0}},
+      {{-1,0}, {0,0}, {1,0}, {0,0}},
+      {{0,0}, {-1,0}, {0,0}, {1,0}}
+    },
+    {
+      {{1,0}, {0,0}, {1,0}, {0,0}},
+      {{0,0}, {1,0}, {0,0}, {1,0}},
+      {{1,0}, {0,0}, {1,0}, {0,0}},
+      {{0,0}, {1,0}, {0,0}, {1,0}}
+    }
+};
+
+
+// todo pass projector
+void multiplySpinorByDiracProjector(float *res, int projIdx, float *spinorIn) {
+    zero(res, 4*3*2);
+
+    for (int s = 0; s < 4; s++) {
+        for (int t = 0; t < 4; t++) {
+            float projRe = projector[projIdx][s][t][0];
+            float projIm = projector[projIdx][s][t][1];
+            
+            for (int m = 0; m < 3; m++) {
+                float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
+                float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
+                res[s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
+                res[s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
+            }
+        }
+    }
+}
+
+
+//
+// dslashReference()
+//
+// if oddBit is zero: calculate odd parity spinor elements (using even parity spinor)
+// if oddBit is one:  calculate even parity spinor elements
+//
+// if daggerBit is zero: perform ordinary dslash operator
+// if daggerBit is one:  perform hermitian conjugate of dslash
+//
+void dslashReference(float *res, float **gaugeFull, float *spinorField, int oddBit, int daggerBit) {
+    zero(res, Nh*4*3*2);
+    
+    float *gaugeEven[4], *gaugeOdd[4];
+    for (int dir = 0; dir < 4; dir++) {  
+        gaugeEven[dir] = gaugeFull[dir];
+        gaugeOdd[dir]  = gaugeFull[dir]+Nh*gaugeSiteSize;
+    }
+    
+    for (int i = 0; i < Nh; i++) {
+        for (int dir = 0; dir < 8; dir++) {
+            float *gauge = gaugeLink(i, dir, oddBit, gaugeEven, gaugeOdd);
+            float *spinor = spinorNeighbor(i, dir, oddBit, spinorField);
+            
+            float projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
+            int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
+            multiplySpinorByDiracProjector(projectedSpinor, projIdx, spinor);
+            
+            for (int s = 0; s < 4; s++) {
+                if (dir % 2 == 0)
+                    su3_mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
+                else
+                    su3_Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
+            }
+            
+            sum(&res[i*(4*3*2)], &res[i*(4*3*2)], gaugedSpinor, 4*3*2);
+        }
+    }
+}
+
+void Mat(float *out, float **gauge, float *in, float kappa) {
+    float *inEven = in;
+    float *inOdd  = in + Nh*spinorSiteSize;
+    float *outEven = out;
+    float *outOdd = out + Nh*spinorSiteSize;
+    
+    // full dslash operator
+    dslashReference(outOdd, gauge, inEven, 1, 0);
+    dslashReference(outEven, gauge, inOdd, 0, 0);
+    
+    // lastly apply the kappa term
+    xpay(in, -kappa, out, N*spinorSiteSize);
+}
+
+void MatDag(float *out, float **gauge, float *in, float kappa) {
+    float *inEven = in;
+    float *inOdd  = in + Nh*spinorSiteSize;
+    float *outEven = out;
+    float *outOdd = out + Nh*spinorSiteSize;
+    
+    // full dslash operator
+    dslashReference(outOdd, gauge, inEven, 1, 1);
+    dslashReference(outEven, gauge, inOdd, 0, 1);
+    
+    // lastly apply the kappa term
+    xpay(in, -kappa, out, N*spinorSiteSize);
+}
+
+void MatDagMat(float *out, float **gauge, float *in, float kappa) {
+    float *tmp = (float*)malloc(N*spinorSiteSize*sizeof(float));
+    Mat(tmp, gauge, in, kappa);
+    MatDag(out, gauge, tmp, kappa);
+    free(tmp);
+}
+
+// Apply the even-odd preconditioned Dirac operator
+void MatPC(float *outEven, float **gauge, float *inEven, float kappa,
+	   MatPCType matpc_type) {
+
+    float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
+    
+    // full dslash operator
+    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+      dslashReference(tmp, gauge, inEven, 1, 0);
+      dslashReference(outEven, gauge, tmp, 0, 0);
+    } else {
+      dslashReference(tmp, gauge, inEven, 0, 0);
+      dslashReference(outEven, gauge, tmp, 1, 0);
+    }    
+
+    // lastly apply the kappa term
+    float kappa2 = -kappa*kappa;
+    xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
+    free(tmp);
+}
+
+// Apply the even-odd preconditioned Dirac operator
+void MatPCDag(float *outEven, float **gauge, float *inEven, float kappa, 
+	      MatPCType matpc_type) {
+
+    float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));    
+
+    // full dslash operator
+    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
+      dslashReference(tmp, gauge, inEven, 1, 1);
+      dslashReference(outEven, gauge, tmp, 0, 1);
+    } else {
+      dslashReference(tmp, gauge, inEven, 0, 1);
+      dslashReference(outEven, gauge, tmp, 1, 1);
+    }
+    
+    float kappa2 = -kappa*kappa;
+    xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
+    free(tmp);
+}
+
+void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa, 
+		   MatPCType matpc_type) {
+    float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
+    MatPC(tmp, gauge, in, kappa, matpc_type);
+    MatPCDag(out, gauge, tmp, kappa, matpc_type);
+    free(tmp);
+}
--- a/dslash_test.c
+++ b/dslash_test.c
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <quda.h>
+#include <util_quda.h>
+#include <field_quda.h>
+
+#define FULL_WILSON 1
+
+QudaGaugeParam param;
+FullGauge cudaGauge;
+FullSpinor cudaSpinor;
+ParitySpinor tmp;
+
+float *hostGauge[4];
+float *spinor, *spinorRef;
+float *spinorEven, *spinorOdd;
+    
+float kappa = 1.0;
+int ODD_BIT = 0;
+int DAGGER_BIT = 0;
+    
+void printSpinorHalfField(float *spinor) {
+  printSpinor(&spinor[0*spinorSiteSize]);
+  printf("...\n");
+  printSpinor(&spinor[(Nh-1)*spinorSiteSize]);
+  printf("\n");    
+}
+
+void init() {
+
+  param.cpu_prec = QUDA_SINGLE_PRECISION;
+  param.cuda_prec = QUDA_HALF_PRECISION;
+  param.X = L1;
+  param.Y = L2;
+  param.Z = L3;
+  param.T = L4;
+  param.anisotropy = 2.3;
+  param.reconstruct = QUDA_RECONSTRUCT_8;
+  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+  param.t_boundary = QUDA_ANTI_PERIODIC_T;
+  param.gauge_fix = QUDA_GAUGE_FIXED_NO;
+  gauge_param = &param;
+
+  // construct input fields
+  for (int dir = 0; dir < 4; dir++) {
+    hostGauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
+  }
+
+  spinor = (float*)malloc(N*spinorSiteSize*sizeof(float));
+  spinorRef = (float*)malloc(N*spinorSiteSize*sizeof(float));
+  spinorEven = spinor;
+  spinorOdd = spinor + Nh*spinorSiteSize;
+    
+  printf("Randomizing fields...");
+  constructGaugeField(hostGauge);
+  constructSpinorField(spinor);
+  printf("done.\n"); fflush(stdout);
+  
+  int dev = 0;
+  cudaSetDevice(dev);
+
+  printf("Sending fields to GPU..."); fflush(stdout);
+  cudaGauge = loadGaugeField(hostGauge);
+  cudaSpinor = allocateSpinorField();
+  tmp = allocateParitySpinor();
+
+  loadSpinorField(cudaSpinor, (void*)spinor, QUDA_SINGLE_PRECISION, 
+		  QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
+}
+
+void end() {
+  // release memory
+  for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
+  free(spinor);
+  free(spinorRef);
+  freeSpinorField(cudaSpinor);
+  freeSpinorBuffer();
+  freeParitySpinor(tmp);
+}
+
+void dslashRef() {
+  
+  // compare to dslash reference implementation
+  printf("Calculating reference implementation...");
+  fflush(stdout);
+  if (FULL_WILSON)
+    MatPC(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN);
+  else
+    dslashReference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT);
+  printf("done.\n");
+    
+}
+
+double dslashCUDA() {
+
+  // execute kernel
+  const int LOOPS = 200;
+  printf("Executing %d kernel loops...", LOOPS);
+  fflush(stdout);
+  stopwatchStart();
+  for (int i = 0; i < LOOPS; i++) {
+    if (FULL_WILSON)
+      MatPCCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
+    else
+      dslashCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
+  }
+    
+  // check for errors
+  cudaError_t stat = cudaGetLastError();
+  if (stat != cudaSuccess)
+    printf("with ERROR: %s\n", cudaGetErrorString(stat));
+
+  cudaThreadSynchronize();
+  double secs = stopwatchReadSeconds() / LOOPS;
+  printf("done.\n\n");
+
+  return secs;
+}
+
+void dslashTest() {
+
+  init();
+
+  float spinorGiB = (float)Nh*spinorSiteSize*sizeof(float) / (1 << 30);
+  float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
+  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
+  printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);
+  printf("Shared mem: %.3f KB\n", sharedKB);
+
+  int attempts = 100;
+  dslashRef();
+  for (int i=0; i<attempts; i++) {
+    
+    double secs = dslashCUDA();
+    retrieveParitySpinor(spinorOdd, cudaSpinor.odd, QUDA_SINGLE_PRECISION, 
+			 QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
+    
+    // print timing information
+    printf("%fms per loop\n", 1000*secs);
+    int flops = FULL_WILSON ? 1320*2 + 48 : 1320;
+    int floats = FULL_WILSON ? 2*(7*24+8*param.packed_size+24)+24 : 7*24+8*param.packed_size+24;
+    printf("GFLOPS = %f\n", 1.0e-9*flops*Nh/secs);
+    printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
+    
+    int res = compareFloats(spinorOdd, spinorRef, Nh*4*3*2, 1e-1);
+    printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
+  }  
+
+  end();
+
+}
+
+int main(int argc, char **argv) {
+  dslashTest();
+}
+
+
+  /*
+  printf("Reference:\n");
+  printSpinorHalfField(spinorRef);
+    
+  printf("\nCUDA:\n");
+  printSpinorHalfField(spinorOdd);
+
+  int fail_check = 12;
+  int fail[fail_check];
+  for (int f=0; f<fail_check; f++) fail[f] = 0;
+
+  int iter[24];
+  for (int i=0; i<24; i++) iter[i] = 0;
+
+  for (int i=0; i<Nh; i++) {
+    for (int j=0; j<24; j++) {
+      int is = i*24+j;
+      float diff = fabs(spinorRef[is]-spinorOdd[is]);
+      for (int f=0; f<fail_check; f++)
+	if (diff > pow(10,-(f+1))) fail[f]++;
+      //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
+      if (diff > 1e-6) iter[j]++;
+    }
+  }
+    
+  //for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
+
+  for (int f=0; f<fail_check; f++) {
+    printf("%e Failures: %d / %d  = %e\n", pow(10,-(f+1)), fail[f], Nh*24, fail[f] / (float)(Nh*24));
+  }
+
+  if (stat != cudaSuccess)
+    printf("Cuda failed with ERROR: %s\n", cudaGetErrorString(stat));
+  */ 
--- a/enum_quda.h
+++ b/enum_quda.h
+#ifndef _ENUM_QUDA_H
+#define _ENUM_QUDA_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  typedef enum QudaGaugeFieldOrder_s {
+    QUDA_QDP_GAUGE_ORDER, // even-odd, row-column colour
+    QUDA_CPS_WILSON_GAUGE_ORDER, // even-odd, column-row colour
+  } QudaGaugeFieldOrder;
+
+  typedef enum QudaDiracFieldOrder_s {
+    QUDA_DIRAC_ORDER, // even-odd, colour inside spin
+    QUDA_QDP_DIRAC_ORDER, // even-odd, spin inside colour
+    QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, colour inside spin
+  } QudaDiracFieldOrder;  
+
+  typedef enum QudaInverterType_s {
+    QUDA_CG_INVERTER,
+    QUDA_BICGSTAB_INVERTER
+  } QudaInverterType;
+
+  typedef enum QudaPrecision_s {
+    QUDA_HALF_PRECISION,
+    QUDA_SINGLE_PRECISION,
+    QUDA_DOUBLE_PRECISION
+  } QudaPrecision;
+
+  // Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
+  typedef enum QudaMatPCType_s {
+    QUDA_MATPC_EVEN_EVEN,
+    QUDA_MATPC_ODD_ODD
+  } QudaMatPCType;
+
+  // The different solutions supported
+  typedef enum QudaSolutionType_s {
+    QUDA_MAT_SOLUTION,
+    QUDA_MATPC_SOLUTION,
+    QUDA_MATPCDAG_SOLUTION,
+    QUDA_MATPCDAG_MATPC_SOLUTION,
+  } QudaSolutionType;
+
+  typedef enum QudaMassNormalization_s {
+    QUDA_KAPPA_NORMALIZATION,
+    QUDA_MASS_NORMALIZATION
+  } QudaMassNormalization;
+
+  typedef enum QudaPreserveSource_s {
+    QUDA_PRESERVE_SOURCE_NO, // use the source for the residual
+    QUDA_PRESERVE_SOURCE_YES // keep the source intact
+  } QudaPreserveSource;
+
+  typedef enum QudaReconstructType_s {
+    QUDA_RECONSTRUCT_NO, // store all 18 real numbers epxlicitly
+    QUDA_RECONSTRUCT_8, // reconstruct from 8 real numbers
+    QUDA_RECONSTRUCT_12 // reconstruct from 12 real numbers
+  } QudaReconstructType;
+
+  typedef enum QudaGaugeFixed_s {
+    QUDA_GAUGE_FIXED_NO, // No gauge fixing
+    QUDA_GAUGE_FIXED_YES // Gauge field stored in temporal gauge
+  } QudaGaugeFixed;
+
+  typedef enum QudaDagType_s {
+    QUDA_DAG_NO,
+    QUDA_DAG_YES
+  } QudaDagType;
+  
+  typedef enum QudaTboundary_s {
+    QUDA_ANTI_PERIODIC_T = -1,
+    QUDA_PERIODIC_T = 1
+  } QudaTboundary;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ENUM_QUDA_H
--- a/field_quda.cpp
+++ b/field_quda.cpp
--- a/field_quda.h
+++ b/field_quda.h
+#ifndef _QUDA_FIELD_H
+#define _QUDA_FIELD_H
+
+#include <enum_quda.h>
+#include <dslash_quda.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  ParitySpinor allocateParitySpinor();
+  FullSpinor allocateSpinorField();
+  
+  void freeGaugeField(FullGauge gauge);
+  void freeParitySpinor(ParitySpinor spinor);
+  void freeSpinorField(FullSpinor spinor);
+  void freeSpinorBuffer();
+
+  FullGauge loadGaugeField(void *gauge);
+  
+  void loadParitySpinor(ParitySpinor, void *spinor, Precision cpu_prec, Precision cuda_prec, 
+			DiracFieldOrder dirac_order);
+  void loadSpinorField(FullSpinor, void *spinor, Precision cpu_prec,  Precision cuda_prec, 
+		       DiracFieldOrder dirac_order);
+  
+  void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec,  Precision cuda_prec, 
+			    DiracFieldOrder dirac_order);
+  void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec,  Precision cuda_prec, 
+			   DiracFieldOrder dirac_order);
+  
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _QUDA_FIELD_H
--- a/inv_bicgstab_quda.cpp
+++ b/inv_bicgstab_quda.cpp
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <cuComplex.h>
+
+#include <quda.h>
+#include <util_quda.h>
+#include <field_quda.h>
+
+void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 
+			ParitySpinor tmp, QudaInvertParam *perf, DagType dag_type)
+{
+  int len = Nh*spinorSiteSize;
+
+  ParitySpinor r = allocateParitySpinor();
+  ParitySpinor p = allocateParitySpinor();
+  ParitySpinor v = allocateParitySpinor();
+  ParitySpinor t = allocateParitySpinor();
+
+  copyCuda((float *)r, (float *)b, len);
+  zeroCuda((float *)x, len);
+
+  float b2 = normCuda((float *)b, len);
+  float r2 = b2;
+  float stop = b2*perf->tol*perf->tol; // stopping condition of solver
+
+  cuComplex rho = make_cuFloatComplex(1.0f, 0.0f);
+  cuComplex rho0 = rho;
+  cuComplex alpha = make_cuFloatComplex(1.0f, 0.0f);
+  cuComplex omega = make_cuFloatComplex(1.0f, 0.0f);
+  cuComplex beta;
+
+  cuComplex rv;
+  cuComplex rho_rho0;
+  cuComplex alpha_omega;
+  cuComplex beta_omega;
+  cuComplex one = make_cuFloatComplex(1.0f, 0.0f);
+
+  float3 rho_r2;
+  float3 omega_t2;
+
+  int k=0;
+  //printf("%d iterations, r2 = %e\n", k, r2);
+  stopwatchStart();
+  while (r2 > stop && k<perf->maxiter) {
+
+    if (k==0) {
+      rho = make_cuFloatComplex(r2, 0.0);
+      copyCuda((float *)p, (float *)r, len);
+    } else {
+      alpha_omega = cuCdivf(alpha, omega);
+      rho_rho0 = cuCdivf(rho, rho0);
+      beta = cuCmulf(rho_rho0, alpha_omega);
+
+      // p = r - beta*omega*v + beta*(p)
+      beta_omega = cuCmulf(beta, omega); beta_omega.x *= -1.0f; beta_omega.y *= -1.0f;
+      cxpaypbzCuda((float2*)r, beta_omega, (float2*)v, beta, (float2*)p, len/2); // 8
+    }
+
+    if (dag_type == QUDA_DAG_NO) 
+      rv = MatPCcDotWXCuda(v, gauge, p, perf->kappa, tmp, b, perf->matpc_type);
+    else 
+      rv = MatPCDagcDotWXCuda(v, gauge, p, perf->kappa, tmp, b, perf->matpc_type);
+
+    alpha = cuCdivf(rho, rv);
+
+    // r -= alpha*v
+    alpha.x *= -1.0f; alpha.y *= -1.0f;
+    caxpyCuda(alpha, (float2*)v, (float2*)r, len/2); // 4
+    alpha.x *= -1.0f; alpha.y *= -1.0f;
+
+    if (dag_type == QUDA_DAG_NO) MatPCCuda(t, gauge, r, perf->kappa, tmp, perf->matpc_type);
+    else  MatPCDagCuda(t, gauge, r, perf->kappa, tmp, perf->matpc_type);
+
+    // omega = (t, r) / (t, t)
+    omega_t2 = cDotProductNormACuda((float2*)t, (float2*)r, len/2); // 6
+    omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
+
+    //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
+    rho_r2 = caxpbypzYmbwcDotProductWYNormYCuda(alpha, (float2*)p, omega, (float2*)r, 
+						(float2*)x, (float2*)t, (float2*)b, len/2);
+    rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
+
+    k++;
+    //printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normCuda((float*)x, len));
+  }
+  perf->secs += stopwatchReadSeconds();
+
+  //if (k==maxiters) printf("Exceeded maximum iterations %d\n", maxiters);
+
+  float gflops = (1.0e-9*Nh)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
+  //printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
+  perf->gflops += gflops;
+  perf->iter += k;
+
+#if 0
+  // Calculate the true residual
+  if (dag_type == QUDA_DAG_NO) MatPCCuda(t, gauge, x, perf->kappa, tmp, perf->matpc_type);
+  else MatPCDagCuda(t, gauge, x, perf->kappa, tmp, perf->matpc_type);
+  copyCuda((float *)r, (float *)b, len);
+  mxpyCuda((float *)t, (float *)r, len);
+  double true_res = normCuda((float *)r, len);
+  
+  printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", 
+	 k, r2, true_res / b2);
+#endif
+
+  freeParitySpinor(r);
+  freeParitySpinor(v);
+  freeParitySpinor(t);
+  freeParitySpinor(p);
+
+  return;
+}
--- a/inv_cg_quda.cpp
+++ b/inv_cg_quda.cpp
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <quda.h>
+#include <util_quda.h>
+#include <field_quda.h>
+
+void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge, 
+		  ParitySpinor tmp, QudaInvertParam *perf)
+{
+  int len = Nh*spinorSiteSize;
+
+  ParitySpinor r;
+  ParitySpinor p = allocateParitySpinor();
+  ParitySpinor Ap = allocateParitySpinor();
+
+  float b2 = normCuda((float *)b, len);
+  float r2 = b2;
+  float r2_old;
+  float stop = r2*perf->tol*perf->tol; // stopping condition of solver
+
+  float alpha, beta, pAp;
+
+  if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
+    r = allocateParitySpinor();
+    copyCuda((float *)r, (float *)b, len);
+  } else {
+    r = b;
+  }
+  copyCuda((float *)p, (float *)r, len);
+  zeroCuda((float *)x, len);
+
+  int k=0;
+  //printf("%d iterations, r2 = %e\n", k, r2);
+  stopwatchStart();
+  while (r2 > stop && k<perf->maxiter) {
+    MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
+
+    pAp = reDotProductCuda((float *)p, (float *)Ap, len);
+
+    alpha = r2 / pAp;        
+    r2_old = r2;
+    r2 = axpyNormCuda(-alpha, (float *)Ap, (float *)r, len);
+
+    beta = r2 / r2_old;
+
+    axpyZpbxCuda(alpha, (float *)p, (float *)x, (float *)r, beta, len);
+
+    k++;
+    //printf("%d iterations, r2 = %e\n", k, r2);
+  }
+  perf->secs = stopwatchReadSeconds();
+
+  //if (k==maxiters)
+  //printf("Exceeded maximum iterations %d\n", maxiters);
+
+  float gflops = k*(1.0e-9*Nh)*(2*(2*1320+48) + 10*spinorSiteSize);
+  //printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
+  perf->gflops = gflops;
+  perf->iter = k;
+
+#if 0
+  // Calculate the true residual
+  MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type);
+  copyCuda((float *)r, (float *)b, len);
+  mxpyCuda((float *)Ap, (float *)r, len);
+  double true_res = normCuda((float *)r, len);
+  
+  printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", 
+	 k, r2, true_res / b2);
+#endif
+
+  if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeParitySpinor(r);
+  freeParitySpinor(p);
+  freeParitySpinor(Ap);
+
+  return;
+}
--- a/invert_quda.cpp
+++ b/invert_quda.cpp
--- a/invert_quda.h
+++ b/invert_quda.h
--- a/invert_test.c
+++ b/invert_test.c