Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 6cbfc6be authored by rbabich
Browse files

quda: minor cleanup


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@590 be54200a-260c-0410-bdd7-ce6af2a381ab
parent f0247f5f
Version 0.x
- Add auto-tuning blas to improve performance (see README for details)
- Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use.
- Modified loadCloverQuda() to take gauge_param as an additional
parameter.
- Added auto-tuning blas to improve performance (see README for details).
- Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi).
- Cleaned up the invert_test example to remove unnecessary dependencies.
......
Release Notes for QUDA v0.1 17 November 2009
Release Notes for QUDA v0.x ?? December 2009
---------------------------
Overview:
......@@ -53,14 +53,14 @@ against lib/libquda.a, and study tests/invert_test.c for an example of
the interface. The various inverter options are enumerated in
include/enum_quda.h.
The lib/blas_quda.cu file contains all of the BLAS like functions
The lib/blas_quda.cu file contains all of the BLAS-like functions
required for the inverters. The threads per block and blocks per grid
parameters are auto-tuned using the blas_test function in quda/tests,
and the output stored in blas_param.h which is included here. These
optimal values may change a function of the CUDA device and the host
hardware, so re-running blas_test and copying over the output
blas_param.h and recompiling the blas library may provide extra
performance.
parameters are auto-tuned using the blas_test function in tests/, and
the output stored in blas_param.h which is included here. These
optimal values may change as a function of the CUDA device and the
host hardware, so re-running blas_test and copying over the output
blas_param.h into lib/ and recompiling the blas library may provide
extra performance.
Known issues:
......
......@@ -2,7 +2,7 @@
#define _DSLASH_QUDA_H
#include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h>
#ifdef __cplusplus
......
......@@ -100,7 +100,6 @@ extern "C" {
}
#endif
#include <quda.h>
#include <blas_quda.h>
#include <dslash_quda.h>
......
......@@ -11,9 +11,10 @@ QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \
# files containing complex macros and other code fragments to be inlined,
# found in lib/
QUDA_INLN = check_params.h clover_def.h dslash_common.h dslash_def.h \
dslash_textures.h io_spinor.h read_clover.h read_gauge.h \
reduce_complex_core.h reduce_core.h reduce_triple_core.h
QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h \
dslash_def.h dslash_textures.h io_spinor.h read_clover.h \
read_gauge.h reduce_complex_core.h reduce_core.h \
reduce_triple_core.h
# files generated by the scripts in lib/generate/, found in lib/dslash_core/
# (The current clover_core.h was edited by hand.)
......
......@@ -40,7 +40,7 @@ int blas_blocks[3][22];
dim3 blasBlock;
dim3 blasGrid;
void initBlas() {
void initBlas(void) {
if (!d_reduceFloat) {
if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
......
#include <stdlib.h>
#include <stdio.h>
#include <quda_internal.h>
#include <dslash_quda.h>
#include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed
......@@ -124,7 +125,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
initDslash = 1;
}
void bindGaugeTex(FullGauge gauge, int oddBit) {
static void bindGaugeTex(FullGauge gauge, int oddBit) {
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
if (oddBit) {
cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes);
......@@ -613,8 +614,7 @@ void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int d
}
void bindCloverTex(ParityClover clover) {
static void bindCloverTex(ParityClover clover) {
if (clover.precision == QUDA_DOUBLE_PRECISION) {
cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes);
} else if (clover.precision == QUDA_SINGLE_PRECISION) {
......
......@@ -3,6 +3,7 @@
#include <math.h>
#include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h>
#include <spinor_quda.h>
......
......@@ -2,6 +2,7 @@
#include <stdlib.h>
#include <math.h>
#include <quda.h>
#include <quda_internal.h>
#include <util_quda.h>
#include <spinor_quda.h>
......
......@@ -18,7 +18,7 @@ FullClover cudaCloverSloppy;
FullClover cudaCloverInvPrecise; // inverted clover term
FullClover cudaCloverInvSloppy;
void initBlas();
void initBlas(void);
// define newQudaGaugeParam() and newQudaInvertParam()
#define INIT_PARAM
......
......@@ -8,7 +8,7 @@
#include <test_util.h>
QudaPrecision cuda_prec;
QudaPrecision other_precision; // Used for copy benchmark
QudaPrecision other_prec; // Used for copy benchmark
ParitySpinor x, y, z, w, v, p;
int nIters;
......@@ -53,7 +53,7 @@ void init() {
x = allocateParitySpinor(X, cuda_prec, sp_pad);
y = allocateParitySpinor(X, cuda_prec, sp_pad);
z = allocateParitySpinor(X, cuda_prec, sp_pad);
p = allocateParitySpinor(X, other_precision, sp_pad);
p = allocateParitySpinor(X, other_prec, sp_pad);
}
void end() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment