Advanced Computing Platform for Theoretical Physics

Commit 6cbfc6be authored by rbabich
Browse files

quda: minor cleanup


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@590 be54200a-260c-0410-bdd7-ce6af2a381ab
parent f0247f5f
Version 0.x Version 0.x
- Add auto-tuning blas to improve performance (see README for details)
- Introduced new interface functions newQudaGaugeParam() and - Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use. invert_test for an example of their use.
- Modified loadCloverQuda() to take gauge_param as an additional - Modified loadCloverQuda() to take gauge_param as an additional
parameter. parameter.
- Added auto-tuning blas to improve performance (see README for details).
- Improved stability of the half precision 8-parameter SU(3) - Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi). reconstruction (with thanks to Guochun Shi).
- Cleaned up the invert_test example to remove unnecessary dependencies. - Cleaned up the invert_test example to remove unnecessary dependencies.
......
Release Notes for QUDA v0.1 17 November 2009 Release Notes for QUDA v0.x ?? December 2009
--------------------------- ---------------------------
Overview: Overview:
...@@ -53,14 +53,14 @@ against lib/libquda.a, and study tests/invert_test.c for an example of ...@@ -53,14 +53,14 @@ against lib/libquda.a, and study tests/invert_test.c for an example of
the interface. The various inverter options are enumerated in the interface. The various inverter options are enumerated in
include/enum_quda.h. include/enum_quda.h.
The lib/blas_quda.cu file contains all of the BLAS like functions The lib/blas_quda.cu file contains all of the BLAS-like functions
required for the inverters. The threads per block and blocks per grid required for the inverters. The threads per block and blocks per grid
parameters are auto-tuned using the blas_test function in quda/tests, parameters are auto-tuned using the blas_test function in tests/, and
and the output stored in blas_param.h which is included here. These the output stored in blas_param.h which is included here. These
optimal values may change a function of the CUDA device and the host optimal values may change as a function of the CUDA device and the
hardware, so re-running blas_test and copying over the output host hardware, so re-running blas_test and copying over the output
blas_param.h and recompiling the blas library may provide extra blas_param.h into lib/ and recompiling the blas library may provide
performance. extra performance.
Known issues: Known issues:
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define _DSLASH_QUDA_H #define _DSLASH_QUDA_H
#include <cuComplex.h> #include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h> #include <quda_internal.h>
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -100,7 +100,6 @@ extern "C" { ...@@ -100,7 +100,6 @@ extern "C" {
} }
#endif #endif
#include <quda.h>
#include <blas_quda.h> #include <blas_quda.h>
#include <dslash_quda.h> #include <dslash_quda.h>
......
...@@ -11,9 +11,10 @@ QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \ ...@@ -11,9 +11,10 @@ QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \
# files containing complex macros and other code fragments to be inlined, # files containing complex macros and other code fragments to be inlined,
# found in lib/ # found in lib/
QUDA_INLN = check_params.h clover_def.h dslash_common.h dslash_def.h \ QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h \
dslash_textures.h io_spinor.h read_clover.h read_gauge.h \ dslash_def.h dslash_textures.h io_spinor.h read_clover.h \
reduce_complex_core.h reduce_core.h reduce_triple_core.h read_gauge.h reduce_complex_core.h reduce_core.h \
reduce_triple_core.h
# files generated by the scripts in lib/generate/, found in lib/dslash_core/ # files generated by the scripts in lib/generate/, found in lib/dslash_core/
# (The current clover_core.h was edited by hand.) # (The current clover_core.h was edited by hand.)
......
...@@ -40,7 +40,7 @@ int blas_blocks[3][22]; ...@@ -40,7 +40,7 @@ int blas_blocks[3][22];
dim3 blasBlock; dim3 blasBlock;
dim3 blasGrid; dim3 blasGrid;
void initBlas() { void initBlas(void) {
if (!d_reduceFloat) { if (!d_reduceFloat) {
if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) { if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
......
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <quda_internal.h>
#include <dslash_quda.h> #include <dslash_quda.h>
#include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed #include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed
...@@ -124,7 +125,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) { ...@@ -124,7 +125,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
initDslash = 1; initDslash = 1;
} }
void bindGaugeTex(FullGauge gauge, int oddBit) { static void bindGaugeTex(FullGauge gauge, int oddBit) {
if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (oddBit) { if (oddBit) {
cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes); cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes);
...@@ -613,8 +614,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d ...@@ -613,8 +614,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
} }
static void bindCloverTex(ParityClover clover) {
void bindCloverTex(ParityClover clover) {
if (clover.precision == QUDA_DOUBLE_PRECISION) { if (clover.precision == QUDA_DOUBLE_PRECISION) {
cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes); cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
} else if (clover.precision == QUDA_SINGLE_PRECISION) { } else if (clover.precision == QUDA_SINGLE_PRECISION) {
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <math.h> #include <math.h>
#include <cuComplex.h> #include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h> #include <quda_internal.h>
#include <spinor_quda.h> #include <spinor_quda.h>
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <math.h> #include <math.h>
#include <quda.h>
#include <quda_internal.h> #include <quda_internal.h>
#include <util_quda.h> #include <util_quda.h>
#include <spinor_quda.h> #include <spinor_quda.h>
......
...@@ -18,7 +18,7 @@ FullClover cudaCloverSloppy; ...@@ -18,7 +18,7 @@ FullClover cudaCloverSloppy;
FullClover cudaCloverInvPrecise; // inverted clover term FullClover cudaCloverInvPrecise; // inverted clover term
FullClover cudaCloverInvSloppy; FullClover cudaCloverInvSloppy;
void initBlas(); void initBlas(void);
// define newQudaGaugeParam() and newQudaInvertParam() // define newQudaGaugeParam() and newQudaInvertParam()
#define INIT_PARAM #define INIT_PARAM
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include <test_util.h> #include <test_util.h>
QudaPrecision cuda_prec; QudaPrecision cuda_prec;
QudaPrecision other_precision; // Used for copy benchmark QudaPrecision other_prec; // Used for copy benchmark
ParitySpinor x, y, z, w, v, p; ParitySpinor x, y, z, w, v, p;
int nIters; int nIters;
...@@ -53,7 +53,7 @@ void init() { ...@@ -53,7 +53,7 @@ void init() {
x = allocateParitySpinor(X, cuda_prec, sp_pad); x = allocateParitySpinor(X, cuda_prec, sp_pad);
y = allocateParitySpinor(X, cuda_prec, sp_pad); y = allocateParitySpinor(X, cuda_prec, sp_pad);
z = allocateParitySpinor(X, cuda_prec, sp_pad); z = allocateParitySpinor(X, cuda_prec, sp_pad);
p = allocateParitySpinor(X, other_precision, sp_pad); p = allocateParitySpinor(X, other_prec, sp_pad);
} }
void end() { void end() {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment