quda: cleaned up header files (invert_quda.cpp is now interface_quda.cpp)

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@593 be54200a-260c-0410-bdd7-ce6af2a381ab

quda: cleaned up header files (invert_quda.cpp is now interface_quda.cpp)
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@593 be54200a-260c-0410-bdd7-ce6af2a381ab
e5f59d3a · rbabich · c3ded658 · e5f59d3a · e5f59d3a · e5f59d3a
Commit e5f59d3a authored Dec 09, 2009 by rbabich
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
-#include <cuComplex.h>
-#include <enum_quda.h>
-
 #ifndef _QUDA_BLAS_H
 #define _QUDA_BLAS_H

+#include <cuComplex.h>
+#include <quda_internal.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif

--- a/include/clover_quda.h
+++ b/include/clover_quda.h
+#ifndef _CLOVER_QUDA_H
+#define _CLOVER_QUDA_H
+
+#include <quda_internal.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  void allocateParityClover(ParityClover *, int *X, int pad,
+			    Precision precision);
+  void allocateCloverField(FullClover *, int *X, int pad, Precision precision);
+
+  void freeParityClover(ParityClover *clover);
+  void freeCloverField(FullClover *clover);
+
+  void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
+			CloverFieldOrder clover_order);
+  void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
+		      CloverFieldOrder clover_order);
+  void loadCloverField(FullClover ret, void *clover, Precision cpu_prec,
+		       CloverFieldOrder clover_order);
+
+  /* void createCloverField(FullClover *cudaClover, void *cpuClover, int *X,
+                         Precision precision); */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _CLOVER_QUDA_H
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
 #ifndef _DSLASH_QUDA_H
 #define _DSLASH_QUDA_H

-#include <cuComplex.h>
-#include <quda.h>
 #include <quda_internal.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

-  extern FullGauge cudaGaugePrecise;
-  extern FullGauge cudaGaugeSloppy;
-
-  extern FullClover cudaCloverPrecise;
-  extern FullClover cudaCloverSloppy;
-
-  extern FullClover cudaCloverInvPrecise;
-  extern FullClover cudaCloverInvSloppy;
-
-// ---------- dslash_quda.cu ----------
+  extern unsigned long long dslash_quda_flops;
+  extern unsigned long long dslash_quda_bytes;

  int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);

@@ -107,17 +97,6 @@ extern "C" {
  void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
 		   ParitySpinor spinor, int oddBit);

-  // -- inv_cg_cuda.cpp
-  void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
-		    QudaInvertParam *param);
-  
-  // -- inv_bicgstab_cuda.cpp
-  void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp, 
-			  QudaInvertParam *param, DagType dag_type);
-  
-  extern unsigned long long dslash_quda_flops;
-  extern unsigned long long dslash_quda_bytes;
-
 #ifdef __cplusplus
 }
 #endif

--- a/include/gauge_quda.h
+++ b/include/gauge_quda.h
 #ifndef _GAUGE_QUDA_H
 #define _GAUGE_QUDA_H

-#include <enum_quda.h>
-#include <dslash_quda.h>
+#include <quda_internal.h>

 #ifdef __cplusplus
 extern "C" {

--- a/include/invert_quda.h
+++ b/include/invert_quda.h
+#ifndef _INVERT_QUDA_H
+#define _INVERT_QUDA_H
+
+#include <quda_internal.h>
+#include <quda.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  extern FullGauge cudaGaugePrecise;
+  extern FullGauge cudaGaugeSloppy;
+
+  extern FullClover cudaCloverPrecise;
+  extern FullClover cudaCloverSloppy;
+
+  extern FullClover cudaCloverInvPrecise;
+  extern FullClover cudaCloverInvSloppy;
+
+  // -- inv_cg_quda.cpp
+  void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
+		    QudaInvertParam *param);
+  
+  // -- inv_bicgstab_quda.cpp
+  void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp, 
+			  QudaInvertParam *param, DagType dag_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _INVERT_QUDA_H
--- a/include/quda.h
+++ b/include/quda.h
@@ -78,7 +78,7 @@ extern "C" {

  } QudaInvertParam;

-  // Interface functions, found in invert_quda.cpp
+  // Interface functions, found in interface_quda.cpp

  void initQuda(int dev);
  void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);

--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -100,7 +100,4 @@ extern "C" {
 }
 #endif

-#include <blas_quda.h>
-#include <dslash_quda.h>
-
 #endif // _QUDA_INTERNAL_H
--- a/include/spinor_quda.h
+++ b/include/spinor_quda.h
-#ifndef _QUDA_SPINOR_H
-#define _QUDA_SPINOR_H
+#ifndef _SPINOR_QUDA_H
+#define _SPINOR_QUDA_H

-#include <enum_quda.h>
-#include <dslash_quda.h>
+#include <quda_internal.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

-  // -- spinor_quda.cpp
-
  ParitySpinor allocateParitySpinor(int *X, Precision precision, int stride);
  FullSpinor allocateSpinorField(int *X, Precision precision, int stride);
  
@@ -30,26 +27,8 @@ extern "C" {
  void spinorHalfPack(float *c, short *s0, float *f0);
  void spinorHalfUnpack(float *f0, float *c, short *s0);

-  // -- clover_quda.cpp
-
-  void allocateParityClover(ParityClover *, int *X, int pad, Precision precision);
-  void allocateCloverField(FullClover *, int *X, int pad, Precision precision);
-
-  void freeParityClover(ParityClover *clover);
-  void freeCloverField(FullClover *clover);
-
-  void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
-			CloverFieldOrder clover_order);
-  void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
-		      CloverFieldOrder clover_order);
-  void loadCloverField(FullClover ret, void *clover, Precision cpu_prec,
-		       CloverFieldOrder clover_order);
-
-  /* void createCloverField(FullClover *cudaClover, void *cpuClover, int *X,
-                         Precision precision); */
-
 #ifdef __cplusplus
 }
 #endif

-#endif // _QUDA_SPINOR_H
+#endif // _SPINOR_QUDA_H
--- a/include/util_quda.h
+++ b/include/util_quda.h
 #ifndef _UTIL_QUDA_H
 #define _UTIL_QUDA_H

-#include <quda_internal.h>
-
 #ifdef __cplusplus
 extern "C" {
 #endif

--- a/lib/Makefile
+++ b/lib/Makefile
 include ../make.inc

 QUDA = libquda.a
-QUDA_OBJS = blas_quda.o clover_quda.o dslash_quda.o gauge_quda.o      \
-	inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o spinor_quda.o \
+QUDA_OBJS = blas_quda.o clover_quda.o dslash_quda.o gauge_quda.o         \
+	inv_bicgstab_quda.o inv_cg_quda.o interface_quda.o spinor_quda.o \
 	util_quda.o

 # header files, found in include/
-QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \
-	quda_internal.h spinor_quda.h util_quda.h
+QUDA_HDRS = blas_quda.h clover_quda.h dslash_quda.h enum_quda.h gauge_quda.h \
+	invert_quda.h quda.h quda_internal.h spinor_quda.h util_quda.h

 # files containing complex macros and other code fragments to be inlined,
 # found in lib/
-QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h	\
-	dslash_def.h dslash_textures.h io_spinor.h read_clover.h	\
-	read_gauge.h reduce_complex_core.h reduce_core.h		\
+QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h	 \
+	dslash_def.h dslash_textures.h io_spinor.h read_clover.h	 \
+	read_gauge.h reduce_complex_core.h reduce_core.h		 \
 	reduce_triple_core.h

 # files generated by the scripts in lib/generate/, found in lib/dslash_core/

--- a/lib/blas_quda.cu
+++ b/lib/blas_quda.cu
@@ -2,6 +2,7 @@
 #include <stdio.h>

 #include <quda_internal.h>
+#include <blas_quda.h>

 #define REDUCE_MAX_BLOCKS 2048


--- a/lib/clover_quda.cpp
+++ b/lib/clover_quda.cpp
@@ -3,7 +3,7 @@
 #include <math.h>

 #include <quda_internal.h>
-#include <spinor_quda.h>
+#include <clover_quda.h>

 void allocateParityClover(ParityClover *ret, int *X, int pad, Precision precision)
 {

--- a/lib/dslash_constants.h
+++ b/lib/dslash_constants.h
@@ -36,4 +36,3 @@ __constant__ float pi_f;
 // double precision constants
 __constant__ double anisotropy;
 __constant__ double t_boundary;
-
--- a/lib/dslash_def.h
+++ b/lib/dslash_def.h
 // dslash_def.h - Dslash kernel definitions

-// There are currently 64 different variants of the Dslash kernel,
+// There are currently 288 different variants of the Dslash kernel,
 // each one characterized by a set of 6 options, where each option can
-// take one of two values (2^6 = 64).  This file is structured so that
-// the C preprocessor loops through all 64 variants (in a manner
-// resembling a binary counter), sets the appropriate macros, and
-// defines the corresponding functions.
+// take one of several values (3*3*4*2*2*2 = 288).  This file is
+// structured so that the C preprocessor loops through all 288
+// variants (in a manner resembling a counter), sets the appropriate
+// macros, and defines the corresponding functions.
 //
 // As an example of the function naming conventions, consider
 //
@@ -250,7 +250,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
 #define DD_SPREC 2
 #else

-#undef DD_SPREC // from here
+#undef DD_SPREC
 #define DD_SPREC 0

 #if (DD_CPREC==0)
@@ -263,7 +263,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
 #undef DD_CPREC
 #define DD_CPREC 3

-#else // to here
+#else

 #undef DD_LOOP
 #undef DD_DAG
@@ -271,9 +271,9 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
 #undef DD_RECON
 #undef DD_GPREC
 #undef DD_SPREC
-#undef DD_CPREC //
+#undef DD_CPREC

-#endif // DD_CPREC //
+#endif // DD_CPREC
 #endif // DD_SPREC
 #endif // DD_GPREC
 #endif // DD_RECON

--- a/lib/dslash_quda.cu
+++ b/lib/dslash_quda.cu
@@ -5,8 +5,8 @@
 #include <dslash_quda.h>
 #include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed

-#include<dslash_textures.h>
-#include<dslash_constants.h>
+#include <dslash_textures.h>
+#include <dslash_constants.h>

 unsigned long long dslash_quda_flops;
 unsigned long long dslash_quda_bytes;
@@ -33,7 +33,7 @@ int dslashCudaSharedBytes(Precision precision) {

 #include <dslash_common.h>

-int initDslash = 0;
+static int initDslash = 0;

 void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
  int Vh = gauge.volume;
@@ -160,8 +160,6 @@ static void bindGaugeTex(FullGauge gauge, int oddBit) {
  }
 }

-// ----------------------------------------------------------------------
-
 // ----------------------------------------------------------------------
 // plain Wilson Dslash:


--- a/lib/gauge_quda.cpp
+++ b/lib/gauge_quda.cpp
 #include <stdlib.h>
 #include <stdio.h>
+#include <math.h>

 #include <quda_internal.h>
 #include <gauge_quda.h>

--- a/lib/invert_quda.cpp
+++ b/lib/invert_quda.cpp
@@ -4,8 +4,12 @@

 #include <quda.h>
 #include <quda_internal.h>
-#include <spinor_quda.h>
 #include <gauge_quda.h>
+#include <spinor_quda.h>
+#include <clover_quda.h>
+#include <blas_quda.h>
+#include <dslash_quda.h>
+#include <invert_quda.h>

 #define spinorSiteSize 24 // real numbers per spinor


--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -3,10 +3,11 @@
 #include <math.h>
 #include <cuComplex.h>

-#include <quda.h>
 #include <quda_internal.h>
 #include <spinor_quda.h>
-
+#include <blas_quda.h>
+#include <dslash_quda.h>
+#include <invert_quda.h>
 #include <util_quda.h>

 void MatVec(ParitySpinor out, FullGauge gauge,  FullClover clover, FullClover cloverInv, ParitySpinor in, 

--- a/lib/inv_cg_quda.cpp
+++ b/lib/inv_cg_quda.cpp
@@ -2,10 +2,12 @@
 #include <stdlib.h>
 #include <math.h>

-#include <quda.h>
 #include <quda_internal.h>
-#include <util_quda.h>
 #include <spinor_quda.h>
+#include <blas_quda.h>
+#include <dslash_quda.h>
+#include <invert_quda.h>
+#include <util_quda.h>

 void MatVec(ParitySpinor out, FullGauge gauge,  FullClover clover, FullClover cloverInv, ParitySpinor in, 
 	    QudaInvertParam *invert_param, ParitySpinor tmp) {

--- a/lib/io_spinor.h
+++ b/lib/io_spinor.h
-#define READ_SPINOR_DOUBLE(spinor)		     \
+#define READ_SPINOR_DOUBLE(spinor)	                          \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride));   \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride));   \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride));   \
@@ -9,10 +9,10 @@
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride));   \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride));   \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride));   \
-  double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride));   \
+  double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));

-#define READ_SPINOR_DOUBLE_UP(spinor)		     \
+#define READ_SPINOR_DOUBLE_UP(spinor)		                  \
  double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride));   \
  double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride));   \
  double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride));   \
@@ -20,15 +20,15 @@
  double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride));   \
  double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride));

-#define READ_SPINOR_DOUBLE_DOWN(spinor)		     \
+#define READ_SPINOR_DOUBLE_DOWN(spinor)		                  \
  double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride));   \
  double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride));   \
  double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride));   \
  double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride));   \
-  double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride));   \
+  double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
  double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));

-#define READ_SPINOR_SINGLE(spinor)		     \
+#define READ_SPINOR_SINGLE(spinor)		              \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
@@ -36,102 +36,102 @@
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));

-#define READ_SPINOR_SINGLE_UP(spinor)		     \
+#define READ_SPINOR_SINGLE_UP(spinor)		              \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \

-#define READ_SPINOR_SINGLE_DOWN(spinor)		     \
+#define READ_SPINOR_SINGLE_DOWN(spinor)                       \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));

-#define READ_SPINOR_HALF(spinor)		     \
+#define READ_SPINOR_HALF(spinor)                              \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));   \
-  float C = tex1Dfetch((spinorTexNorm), sp_idx);     \
-  I0.x *= C; I0.y *= C;	I0.z *= C; I0.w *= C;	     \
-  I1.x *= C; I1.y *= C;	I1.z *= C; I1.w *= C;	     \
-  I2.x *= C; I2.y *= C;	I2.z *= C; I2.w *= C;        \
-  I3.x *= C; I3.y *= C;	I3.z *= C; I3.w *= C;	     \
-  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;	     \
+  float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
+  I0.x *= C; I0.y *= C;	I0.z *= C; I0.w *= C;	              \
+  I1.x *= C; I1.y *= C;	I1.z *= C; I1.w *= C;	              \
+  I2.x *= C; I2.y *= C;	I2.z *= C; I2.w *= C;                 \
+  I3.x *= C; I3.y *= C;	I3.z *= C; I3.w *= C;	              \
+  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;	              \
  I5.x *= C; I5.y *= C;	I5.z *= C; I5.w *= C;					     

-#define READ_SPINOR_HALF_UP(spinor)		     \
+#define READ_SPINOR_HALF_UP(spinor)		              \
  float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride));   \
  float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride));   \
  float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride));   \
-  float C = tex1Dfetch((spinorTexNorm), sp_idx);     \
-  I0.x *= C; I0.y *= C;	I0.z *= C; I0.w *= C;	     \
-  I1.x *= C; I1.y *= C;	I1.z *= C; I1.w *= C;	     \
-  I2.x *= C; I2.y *= C;	I2.z *= C; I2.w *= C;        \
+  float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
+  I0.x *= C; I0.y *= C;	I0.z *= C; I0.w *= C;	              \
+  I1.x *= C; I1.y *= C;	I1.z *= C; I1.w *= C;	              \
+  I2.x *= C; I2.y *= C;	I2.z *= C; I2.w *= C;                 \

-#define READ_SPINOR_HALF_DOWN(spinor)		     \
+#define READ_SPINOR_HALF_DOWN(spinor)		              \
  float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride));   \
  float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride));   \
  float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));   \
-  float C = tex1Dfetch((spinorTexNorm), sp_idx);     \
-  I3.x *= C; I3.y *= C;	I3.z *= C; I3.w *= C;	     \
-  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;	     \
+  float C = tex1Dfetch((spinorTexNorm), sp_idx);              \
+  I3.x *= C; I3.y *= C;	I3.z *= C; I3.w *= C;	              \
+  I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C;	              \
  I5.x *= C; I5.y *= C;	I5.z *= C; I5.w *= C;					     

-#define READ_ACCUM_DOUBLE(spinor)				\
-  double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride));		\
-  double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride));		\
-  double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));		\
-  double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride));		\
-  double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride));		\
-  double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride));		\
-  double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride));		\
-  double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride));		\
-  double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride));		\
-  double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride));		\
-  double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride));	\
+#define READ_ACCUM_DOUBLE(spinor)				   \
+  double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride));   \
+  double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride));   \
+  double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride));   \
+  double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride));   \
+  double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride));   \
+  double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride));   \
+  double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride));   \
+  double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride));   \
+  double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride));   \
+  double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride));   \
+  double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \
  double2 accum11 = fetch_double2((spinor), sid + 11*(sp_stride));	

-#define READ_ACCUM_SINGLE(spinor)			\
-  float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));	\
-  float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));	\
-  float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));	\
-  float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));	\
-  float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));	\
+#define READ_ACCUM_SINGLE(spinor)                                  \
+  float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));       \
+  float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));       \
+  float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));       \
+  float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));       \
+  float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));       \
  float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); 

-#define READ_ACCUM_HALF(spinor)					     \
-  float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));		     \
-  float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));		     \
-  float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));		     \
-  float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));		     \
-  float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));		     \
-  float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride));		     \
-  float C = tex1Dfetch((accumTexNorm), sid);			     \
-  accum0.x *= C; accum0.y *= C;	accum0.z *= C; accum0.w *= C;	     \
-  accum1.x *= C; accum1.y *= C;	accum1.z *= C; accum1.w *= C;	     \
-  accum2.x *= C; accum2.y *= C;	accum2.z *= C; accum2.w *= C;        \
-  accum3.x *= C; accum3.y *= C;	accum3.z *= C; accum3.w *= C;	     \
-  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C;	     \
+#define READ_ACCUM_HALF(spinor)					   \
+  float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride));       \
+  float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride));       \
+  float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride));       \
+  float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride));       \
+  float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride));       \
+  float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride));       \
+  float C = tex1Dfetch((accumTexNorm), sid);		           \
+  accum0.x *= C; accum0.y *= C;	accum0.z *= C; accum0.w *= C;      \
+  accum1.x *= C; accum1.y *= C;	accum1.z *= C; accum1.w *= C;      \
+  accum2.x *= C; accum2.y *= C;	accum2.z *= C; accum2.w *= C;      \
+  accum3.x *= C; accum3.y *= C;	accum3.z *= C; accum3.w *= C;      \
+  accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C;      \
  accum5.x *= C; accum5.y *= C;	accum5.z *= C; accum5.w *= C;					     


-#define WRITE_SPINOR_DOUBLE2()					 \
-  g_out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im);		 \
-  g_out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im);		 \
-  g_out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im);		 \
-  g_out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im);		 \
-  g_out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im);		 \
-  g_out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im);		 \
-  g_out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im);		 \
-  g_out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im);		 \
-  g_out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im);		 \
-  g_out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im);		 \
-  g_out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im);		 \
+#define WRITE_SPINOR_DOUBLE2()					   \
+  g_out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im);	   \
+  g_out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im);	   \
+  g_out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im);	   \
+  g_out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im);	   \
+  g_out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im);	   \
+  g_out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im);	   \
+  g_out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im);	   \
+  g_out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im);	   \
+  g_out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im);	   \
+  g_out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im);	   \
+  g_out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im);	   \
  g_out[11*(sp_stride)+sid] = make_double2(o32_re, o32_im);		 

-#define WRITE_SPINOR_FLOAT4()					 \
+#define WRITE_SPINOR_FLOAT4()					          \
  g_out[0*(sp_stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
  g_out[1*(sp_stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
  g_out[2*(sp_stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \