quda: minor cleanup

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@590 be54200a-260c-0410-bdd7-ce6af2a381ab

quda: minor cleanup
git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@590 be54200a-260c-0410-bdd7-ce6af2a381ab
6cbfc6be · rbabich · f0247f5f · 6cbfc6be · 6cbfc6be · 6cbfc6be
Commit 6cbfc6be authored Dec 09, 2009 by rbabich
--- a/NEWS
+++ b/NEWS

 Version 0.x

- Add auto-tuning blas to improve performance (see README for details)
 - Introduced new interface functions newQudaGaugeParam() and
  newQudaInvertParam() to allow for enhanced error checking.  See
  invert_test for an example of their use.
 - Modified loadCloverQuda() to take gauge_param as an additional
  parameter.
+- Added auto-tuning blas to improve performance (see README for details).
 - Improved stability of the half precision 8-parameter SU(3)
  reconstruction (with thanks to Guochun Shi).
 - Cleaned up the invert_test example to remove unnecessary dependencies.

--- a/README
+++ b/README

-Release Notes for QUDA v0.1                          17 November 2009
+Release Notes for QUDA v0.x                          ?? December 2009
 ---------------------------

 Overview:
@@ -53,14 +53,14 @@ against lib/libquda.a, and study tests/invert_test.c for an example of
 the interface.  The various inverter options are enumerated in
 include/enum_quda.h.

-The lib/blas_quda.cu file contains all of the BLAS like functions
+The lib/blas_quda.cu file contains all of the BLAS-like functions
 required for the inverters.  The threads per block and blocks per grid
-parameters are auto-tuned using the blas_test function in quda/tests,
-and the output stored in blas_param.h which is included here.  These
-optimal values may change a function of the CUDA device and the host
-hardware, so re-running blas_test and copying over the output
-blas_param.h and recompiling the blas library may provide extra
-performance.
+parameters are auto-tuned using the blas_test function in tests/, and
+the output is stored in blas_param.h, which is included here.  These
+optimal values may change as a function of the CUDA device and the
+host hardware, so re-running blas_test, copying the output
+blas_param.h into lib/, and recompiling the BLAS library may provide
+extra performance.


 Known issues:

--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -2,7 +2,7 @@
 #define _DSLASH_QUDA_H

 #include <cuComplex.h>
-
+#include <quda.h>
 #include <quda_internal.h>

 #ifdef __cplusplus

--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -100,7 +100,6 @@ extern "C" {
 }
 #endif

-#include <quda.h>
 #include <blas_quda.h>
 #include <dslash_quda.h>


--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,9 +11,10 @@ QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \

 # files containing complex macros and other code fragments to be inlined,
 # found in lib/
-QUDA_INLN = check_params.h clover_def.h dslash_common.h dslash_def.h	\
-	dslash_textures.h io_spinor.h read_clover.h read_gauge.h	\
-	reduce_complex_core.h reduce_core.h reduce_triple_core.h
+QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h	\
+	dslash_def.h dslash_textures.h io_spinor.h read_clover.h	\
+	read_gauge.h reduce_complex_core.h reduce_core.h		\
+	reduce_triple_core.h

 # files generated by the scripts in lib/generate/, found in lib/dslash_core/
 # (The current clover_core.h was edited by hand.)

--- a/lib/blas_quda.cu
+++ b/lib/blas_quda.cu
@@ -40,7 +40,7 @@ int blas_blocks[3][22];
 dim3 blasBlock;
 dim3 blasGrid;

-void initBlas() {
+void initBlas(void) {
  
  if (!d_reduceFloat) {
    if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {

--- a/lib/dslash_quda.cu
+++ b/lib/dslash_quda.cu
 #include <stdlib.h>
 #include <stdio.h>

+#include <quda_internal.h>
 #include <dslash_quda.h>
 #include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed

@@ -124,7 +125,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
  initDslash = 1;
 }

-void bindGaugeTex(FullGauge gauge, int oddBit) {
+static void bindGaugeTex(FullGauge gauge, int oddBit) {
  if (gauge.precision == QUDA_DOUBLE_PRECISION) {
    if (oddBit) {
      cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes); 
@@ -613,8 +614,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
 }


-
-void bindCloverTex(ParityClover clover) {
+static void bindCloverTex(ParityClover clover) {
  if (clover.precision == QUDA_DOUBLE_PRECISION) {
    cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes); 
  } else if (clover.precision == QUDA_SINGLE_PRECISION) {

--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -3,6 +3,7 @@
 #include <math.h>
 #include <cuComplex.h>

+#include <quda.h>
 #include <quda_internal.h>
 #include <spinor_quda.h>


--- a/lib/inv_cg_quda.cpp
+++ b/lib/inv_cg_quda.cpp
@@ -2,6 +2,7 @@
 #include <stdlib.h>
 #include <math.h>

+#include <quda.h>
 #include <quda_internal.h>
 #include <util_quda.h>
 #include <spinor_quda.h>

--- a/lib/invert_quda.cpp
+++ b/lib/invert_quda.cpp
@@ -18,7 +18,7 @@ FullClover cudaCloverSloppy;
 FullClover cudaCloverInvPrecise; // inverted clover term
 FullClover cudaCloverInvSloppy;

-void initBlas();
+void initBlas(void);

 // define newQudaGaugeParam() and newQudaInvertParam()
 #define INIT_PARAM

--- a/tests/blas_test.cu
+++ b/tests/blas_test.cu
@@ -8,7 +8,7 @@
 #include <test_util.h>

 QudaPrecision cuda_prec;
-QudaPrecision other_precision;   // Used for copy benchmark
+QudaPrecision other_prec; // Used for copy benchmark
 ParitySpinor x, y, z, w, v, p;

 int nIters;
@@ -53,7 +53,7 @@ void init() {
  x = allocateParitySpinor(X, cuda_prec, sp_pad);
  y = allocateParitySpinor(X, cuda_prec, sp_pad);
  z = allocateParitySpinor(X, cuda_prec, sp_pad);
-  p = allocateParitySpinor(X, other_precision, sp_pad);
+  p = allocateParitySpinor(X, other_prec, sp_pad);
 }

 void end() {