Commit 458d290a authored by rbabich

more work on quda clover


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@462 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7f8a13b9
@@ -6,7 +6,7 @@ CPU_ARCH = x86_64 # x86 or x86_64
 GPU_ARCH = sm_13 # sm_10, sm_11, sm_12, or sm_13
 GPU_EMU = false # set to 'true' for device emulation
-PYTHON = python2.6 # python 2.5 or later required for 'make generated'
+PYTHON = python2.6 # python 2.5 or later required for 'make gen'
 ######
@@ -36,44 +36,45 @@ LDFLAGS = -fPIC $(LIB)
 all: dslash_test invert_test su3_test pack_test

-ILIB = libquda.a
-ILIB_OBJS = blas_quda.o blas_reference.o dslash_quda.o dslash_reference.o \
-	gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o \
-	spinor_quda.o util_quda.o
-ILIB_HDRS = blas_quda.h blas_reference.h dslash_def.h dslash_quda.h \
-	dslash_reference.h enum_quda.h gauge_quda.h invert_quda.h \
-	io_spinor.h quda.h read_clover.h read_gauge.h reduce_complex_core.h \
-	reduce_core.h reduce_triple_core.h spinor_quda.h util_quda.h
-ILIB_CORE = dslash_core.h dslash_dagger_core.h
+QUDA = libquda.a
+QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o \
+	dslash_reference.o gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o \
+	invert_quda.o spinor_quda.o util_quda.o
+QUDA_HDRS = blas_quda.h blas_reference.h clover_def.h dslash_def.h \
+	dslash_quda.h dslash_reference.h enum_quda.h gauge_quda.h \
+	invert_quda.h io_spinor.h quda.h read_clover.h read_gauge.h \
+	reduce_complex_core.h reduce_core.h reduce_triple_core.h \
+	spinor_quda.h util_quda.h
+QUDA_CORE = clover_core.h dslash_core.h dslash_dagger_core.h

-$(ILIB): $(ILIB_OBJS)
-	ar cru $@ $(ILIB_OBJS)
+$(QUDA): $(QUDA_OBJS)
+	ar cru $@ $(QUDA_OBJS)

-dslash_test: dslash_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+dslash_test: dslash_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-invert_test: invert_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+invert_test: invert_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-su3_test: su3_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+su3_test: su3_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-pack_test: pack_test.o $(ILIB)
-	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
+pack_test: pack_test.o $(QUDA)
+	$(CXX) $(LDFLAGS) $< $(QUDA) -o $@

-generated:
+gen:
 	$(PYTHON) dslash_cuda_gen.py

 clean:
-	-rm -f *.o dslash_test invert_test su3_test pack_test $(ILIB)
+	-rm -f *.o dslash_test invert_test su3_test pack_test $(QUDA)

-%.o: %.c $(ILIB_HDRS)
+%.o: %.c $(QUDA_HDRS)
 	$(CC) $(CFLAGS) $< -c -o $@

-%.o: %.cpp $(ILIB_HDRS)
+%.o: %.cpp $(QUDA_HDRS)
 	$(CXX) $(CXXFLAGS) $< -c -o $@

-%.o: %.cu $(ILIB_HDRS) $(ILIB_CORE)
+%.o: %.cu $(QUDA_HDRS) $(QUDA_CORE)
 	$(NVCC) $(NVCCFLAGS) $< -c -o $@

-.PHONY: all generated clean
+.PHONY: all gen clean
-QUDA v0.x Release Notes
------------------------
+Release Notes for QUDA v0.x
+---------------------------

 Overview:

@@ -44,7 +45,8 @@ Installation:

 In the source directory, copy the template 'Makefile.tmpl' to
 'Makefile', and edit the first few lines to specify the CUDA install
 path, the platform (x86 or x86_64), and the GPU architecture (see
-"Compatibility" above). Then type 'make' to build the library.
+"Hardware compatibility" above). Then type 'make' to build the
+library.

 Using the library:

@@ -74,9 +76,11 @@ For help or to report a bug, please contact Mike Clark
 (mikec@seas.harvard.edu) or Ron Babich (rbabich@bu.edu).

 If you find this code useful in your work, a citation to the following
-write-up would be appreciated:
+would be appreciated:

 K. Barros et al., "Blasting through lattice calculations using CUDA,"
 PoS LATTICE2008, 045 (2008) [arXiv:0810.5365 [hep-lat]].

-Please also let us know so that we can send you updates and bug-fixes.
+Please also drop us a note so that we can send you updates and
+bug-fixes.
// clover_def.h - clover kernel definitions
// initialize on first iteration
#ifndef DD_LOOP
#define DD_LOOP
#define DD_XPAY 0
#define DD_SPREC 0
#define DD_CPREC 0
#endif
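// (this header re-includes itself below, stepping DD_SPREC and DD_CPREC
// through the values 0, 1, 2 to instantiate all nine precision combinations)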
// set options for current iteration
#if (DD_XPAY==0) // no xpay
#define DD_XPAY_F
#define DD_PARAM2 int oddBit
#else // xpay
#define DD_XPAY_F Xpay
#if (DD_SPREC == 0)
#define DD_PARAM2 int oddBit, double a
#else
#define DD_PARAM2 int oddBit, float a
#endif
#define DSLASH_XPAY
#endif
#if (DD_SPREC==0) // double-precision spinor field
#define DD_SPREC_F D
#define DD_PARAM1 double2* g_out
#define READ_SPINOR READ_SPINOR_DOUBLE
#define SPINORTEX spinorTexDouble
#define WRITE_SPINOR WRITE_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
#if (DD_XPAY==1)
#define ACCUMTEX accumTexDouble
#define READ_ACCUM READ_ACCUM_DOUBLE
#endif
#elif (DD_SPREC==1) // single-precision spinor field
#define DD_SPREC_F S
#define DD_PARAM1 float4* g_out
#define READ_SPINOR READ_SPINOR_SINGLE
#define SPINORTEX spinorTexSingle
#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
#if (DD_XPAY==1)
#define ACCUMTEX accumTexSingle
#define READ_ACCUM READ_ACCUM_SINGLE
#endif
#else // half-precision spinor field
#define DD_SPREC_F H
#define READ_SPINOR READ_SPINOR_HALF
#define SPINORTEX spinorTexHalf
#define DD_PARAM1 short4* g_out, float *c
#define WRITE_SPINOR WRITE_SPINOR_SHORT4
#if (DD_XPAY==1)
#define ACCUMTEX accumTexHalf
#define READ_ACCUM READ_ACCUM_HALF
#endif
#endif
#if (DD_CPREC==0) // double-precision clover term
#define DD_CPREC_F D
#define CLOVERTEX cloverTexDouble
#define READ_CLOVER READ_CLOVER_DOUBLE
#define CLOVER_DOUBLE
#elif (DD_CPREC==1) // single-precision clover term
#define DD_CPREC_F S
#define CLOVERTEX cloverTexSingle
#define READ_CLOVER READ_CLOVER_SINGLE
#else // half-precision clover term
#define DD_CPREC_F H
#define CLOVERTEX cloverTexHalf
#define READ_CLOVER READ_CLOVER_HALF
#endif
#define DD_CONCAT(s,c,x) clover ## s ## c ## x ## Kernel
#define DD_FUNC(s,c,x) DD_CONCAT(s,c,x)
// define the kernel (double-precision variants are compiled only for
// sm_13, i.e. __CUDA_ARCH__ == 130, the first architecture with double support)
#if !(__CUDA_ARCH__ != 130 && (DD_SPREC == 0 || DD_CPREC == 0))
__global__ void
DD_FUNC(DD_SPREC_F, DD_CPREC_F, DD_XPAY_F)(DD_PARAM1, DD_PARAM2) {
#include "clover_core.h"
}
#endif
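// e.g. DD_SPREC=1 (S), DD_CPREC=0 (D), DD_XPAY=0 expands to cloverSDKernel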
// clean up
#undef DD_SPREC_F
#undef DD_CPREC_F
#undef DD_XPAY_F
#undef DD_PARAM1
#undef DD_PARAM2
#undef DD_CONCAT
#undef DD_FUNC
#undef DSLASH_XPAY
#undef READ_SPINOR
#undef SPINORTEX
#undef WRITE_SPINOR
#undef ACCUMTEX
#undef READ_ACCUM
#undef CLOVERTEX
#undef READ_CLOVER
#undef GAUGE_DOUBLE
#undef SPINOR_DOUBLE
#undef CLOVER_DOUBLE
// prepare next set of options, or clean up after final iteration
//#if (DD_XPAY==0) // xpay variant is not needed
//#undef DD_XPAY
//#define DD_XPAY 1
//#else
//#undef DD_XPAY
//#define DD_XPAY 0
#if (DD_SPREC==0)
#undef DD_SPREC
#define DD_SPREC 1
#elif (DD_SPREC==1)
#undef DD_SPREC
#define DD_SPREC 2
#else
#undef DD_SPREC
#define DD_SPREC 0
#if (DD_CPREC==0)
#undef DD_CPREC
#define DD_CPREC 1
#elif (DD_CPREC==1)
#undef DD_CPREC
#define DD_CPREC 2
#else
#undef DD_LOOP
#undef DD_XPAY
#undef DD_SPREC
#undef DD_CPREC
#endif // DD_CPREC
#endif // DD_SPREC
//#endif // DD_XPAY
#ifdef DD_LOOP
#include "clover_def.h"
#endif
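The header above is a self-including "X-macro" loop: each pass selects the
DD_* options, pastes a kernel name together with DD_CONCAT, pulls the body in
from clover_core.h, then advances DD_SPREC/DD_CPREC and re-includes itself. A
minimal sketch of the same pattern, using hypothetical names (demo_def.h,
demo*Kernel) purely for illustration:

/* demo_def.h - toy version of the clover_def.h trick: the header
   re-includes itself, stepping DD_PREC through 0, 1, 2, so the
   function below is stamped out once per precision. */
#ifndef DD_PREC
#define DD_PREC 0
#endif

#if (DD_PREC == 0)
#define DD_PREC_F D
#elif (DD_PREC == 1)
#define DD_PREC_F S
#else
#define DD_PREC_F H
#endif

#define DD_CONCAT(p) demo ## p ## Kernel
#define DD_FUNC(p) DD_CONCAT(p) // extra level so that p expands first

void DD_FUNC(DD_PREC_F)(void)
{
  /* the real header does '#include "clover_core.h"' here */
}

#undef DD_PREC_F
#undef DD_CONCAT
#undef DD_FUNC

// advance to the next precision, or stop after the last one
#if (DD_PREC == 0)
#undef DD_PREC
#define DD_PREC 1
#include "demo_def.h" // emits demoSKernel
#elif (DD_PREC == 1)
#undef DD_PREC
#define DD_PREC 2
#include "demo_def.h" // emits demoHKernel
#else
#undef DD_PREC
#endif

Preprocessing this once yields demoDKernel, demoSKernel, and demoHKernel;
clover_def.h runs the same loop over the 3x3 grid of spinor and clover
precisions, producing kernel names from cloverDDKernel through cloverHHKernel.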
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <quda.h>
#include <spinor_quda.h>
#include <util_quda.h>
ParityClover allocateParityClover(int *X, Precision precision)
{
  ParityClover ret;

  ret.precision = precision;
  ret.volume = 1;
  for (int d=0; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  ret.Ns = 4;
  ret.length = ret.volume*ret.Nc*ret.Nc*ret.Ns*ret.Ns/2; // block-diagonal Hermitian (72 reals)

  if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
  else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
  else ret.bytes = ret.length*sizeof(short);

  if (cudaMalloc((void**)&(ret.clover), ret.bytes) == cudaErrorMemoryAllocation) {
    printf("Error allocating clover term\n");
    exit(-1);
  }

  if (precision == QUDA_HALF_PRECISION) {
    // half precision also carries a norm array: 2 floats per site,
    // i.e. 1/18 of the 72-short clover body
    if (cudaMalloc((void**)&ret.cloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
      printf("Error allocating cloverNorm\n");
      exit(-1);
    }
  }

  return ret;
}
FullClover allocateCloverField(int *X, Precision precision)
{
  FullClover ret;
  ret.even = allocateParityClover(X, precision);
  ret.odd = allocateParityClover(X, precision);
  return ret;
}
void freeParityClover(ParityClover *clover)
{
  cudaFree(clover->clover);
  clover->clover = NULL;
  if (clover->precision == QUDA_HALF_PRECISION) {
    cudaFree(clover->cloverNorm); // release the norm array that accompanies half precision
    clover->cloverNorm = NULL;
  }
}

void freeCloverField(FullClover *clover)
{
  freeParityClover(&clover->even);
  freeParityClover(&clover->odd);
}
template <typename Float>
static inline void packCloverMatrix(float4 *a, Float *b, int Vh)
{
  for (int i=0; i<18; i++) {
    a[i*Vh].x = b[4*i+0];
    a[i*Vh].y = b[4*i+1];
    a[i*Vh].z = b[4*i+2];
    a[i*Vh].w = b[4*i+3];
  }
}

template <typename Float>
static inline void packCloverMatrix(double2 *a, Float *b, int Vh)
{
  for (int i=0; i<36; i++) {
    a[i*Vh].x = b[2*i+0];
    a[i*Vh].y = b[2*i+1];
  }
}
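// Layout note: the Vh stride above puts component j of site i at
// a[j*Vh + i] (structure of arrays), so consecutive CUDA threads,
// which handle consecutive sites, read consecutive double2/float4
// words -- the coalesced pattern the clover textures expect.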
template <typename Float, typename FloatN>
static void packParityClover(FloatN *res, Float *clover, int Vh)
{
  for (int i = 0; i < Vh; i++) {
    packCloverMatrix(res+i, clover+72*i, Vh);
  }
}
template <typename Float, typename FloatN>
static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
{
  int Vh = X[0]*X[1]*X[2]*X[3];
  X[0] *= 2; // X now contains dimensions of the full lattice

  for (int i=0; i<Vh; i++) {
    // count the x-row, xy-plane, and xyz-volume boundaries crossed to
    // reach half-lattice site i; the divisors use the checkerboarded
    // x extent, X[0]/2
    int boundaryCrossings = i/(X[0]/2) + i/(X[1]*X[0]/2) + i/(X[2]*X[1]*X[0]/2);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrix(even+i, clover+72*k, Vh);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrix(odd+i, clover+72*k, Vh);
    }
  }
}
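// Worked example of the even/odd indexing: on a full 4^4 lattice
// (X[0]=4 after doubling), half-lattice index i=2 gives
// boundaryCrossings = 2/2 = 1, so the even-parity site sits at
// lexicographic index k = 2*2 + 1 = 5, i.e. (x,y,z,t) = (1,1,0,0).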
template <typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{
  Float max, a, c;

  // treat the two chiral blocks separately
  for (int chi=0; chi<2; chi++) {
    max = fabs(clover[0]);
    for (int i=1; i<36; i++) {
      if ((a = fabs(clover[i])) > max) max = a;
    }
    c = MAX_SHORT/max;
    for (int i=0; i<9; i++) {
      res[i*Vh].x = (short) (c * clover[4*i+0]);
      res[i*Vh].y = (short) (c * clover[4*i+1]);
      res[i*Vh].z = (short) (c * clover[4*i+2]);
      res[i*Vh].w = (short) (c * clover[4*i+3]);
    }
    norm[chi*Vh] = 1/c;
    res += 9*Vh; // advance past the 9 Vh-strided short4's of this block
    clover += 36;
  }
}
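// Each chiral block is stored as shorts scaled by c = MAX_SHORT/max,
// where max is the largest |entry| in the block; norm records 1/c, so
// the original value is recovered as short * norm.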
template <typename Float>
static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh)
{
  for (int i = 0; i < Vh; i++) {
    packCloverMatrixHalf(res+i, norm+i, clover+72*i, Vh);
  }
}
template <typename Float>
static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm,
			       Float *clover, int *X)
{
  int Vh = X[0]*X[1]*X[2]*X[3];
  X[0] *= 2; // X now contains dimensions of the full lattice

  for (int i=0; i<Vh; i++) {
    // as in packFullClover, the divisors use the checkerboarded x extent
    int boundaryCrossings = i/(X[0]/2) + i/(X[1]*X[0]/2) + i/(X[2]*X[1]*X[0]/2);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrixHalf(even+i, evenNorm+i, clover+72*k, Vh);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrixHalf(odd+i, oddNorm+i, clover+72*k, Vh);
    }
  }
}
void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
		      CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedClover, *packedCloverNorm;

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
    exit(-1);
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    printf("QUDA error: invalid clover order\n");
    exit(-1);
  }

#ifndef __DEVICE_EMULATION__
  cudaMallocHost(&packedClover, ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) cudaMallocHost(&packedCloverNorm, ret.bytes/18);
#else
  packedClover = malloc(ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
#endif

  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (double *)clover, ret.volume);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (float *)clover, ret.volume);
    }
  }

  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
  if (ret.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
#endif
}
void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
		    CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;

  if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
    exit(-1);
  }
  if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    printf("QUDA error: invalid clover order\n");
    exit(-1);
  }

#ifndef __DEVICE_EMULATION__
  cudaMallocHost(&packedEven, ret.even.bytes);
  cudaMallocHost(&packedOdd, ret.even.bytes);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    cudaMallocHost(&packedEvenNorm, ret.even.bytes/18);
    cudaMallocHost(&packedOddNorm, ret.even.bytes/18);
  }
#else
  packedEven = malloc(ret.even.bytes);
  packedOdd = malloc(ret.even.bytes);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    packedEvenNorm = malloc(ret.even.bytes/18);
    packedOddNorm = malloc(ret.even.bytes/18);
  }
#endif

  if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X);
  } else if (ret.even.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (double *)clover, ret.even.X);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (float *)clover, ret.even.X);
    }
  }

  cudaMemcpy(ret.even.clover, packedEven, ret.even.bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(ret.odd.clover, packedOdd, ret.even.bytes, cudaMemcpyHostToDevice);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, ret.even.bytes/18, cudaMemcpyHostToDevice);
    cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, ret.even.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
#else
  free(packedEven);
  free(packedOdd);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    free(packedEvenNorm);
    free(packedOddNorm);
  }
#endif
}
void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFieldOrder clover_order)
{
  void *clover_odd;

  if (cpu_prec == QUDA_SINGLE_PRECISION) clover_odd = (float *)clover + ret.even.length;
  else clover_odd = (double *)clover + ret.even.length;

  if (clover_order == QUDA_LEX_PACKED_CLOVER_ORDER) {
    loadFullClover(ret, clover, cpu_prec, clover_order);
  } else if (clover_order == QUDA_PACKED_CLOVER_ORDER) {
    loadParityClover(ret.even, clover, cpu_prec, clover_order);
    loadParityClover(ret.odd, clover_odd, cpu_prec, clover_order);
  } else {
    printf("QUDA error: CloverFieldOrder %d not supported\n", clover_order);
    exit(-1);
  }
}
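Taken together, these routines define the device-side life cycle of the
clover term. A minimal usage sketch, assuming a 16^4 lattice and a
hypothetical host buffer cloverCpu holding 72 doubles per site in
lexicographic even/odd-interleaved order (everything else is the API defined
above):

#include <stdlib.h>
#include <quda.h>

int main(void)
{
  int X[4] = {8, 16, 16, 16}; // X[0] is the checkerboarded x extent

  size_t sites = 2 * (size_t)X[0]*X[1]*X[2]*X[3]; // both parities
  double *cloverCpu = (double *)calloc(sites * 72, sizeof(double));
  // ... fill cloverCpu with the packed clover matrices ...

  FullClover clov = allocateCloverField(X, QUDA_SINGLE_PRECISION);
  loadCloverField(clov, cloverCpu, QUDA_DOUBLE_PRECISION,
                  QUDA_LEX_PACKED_CLOVER_ORDER);

  // ... launch the clover kernels generated by clover_def.h ...

  freeCloverField(&clov);
  free(cloverCpu);
  return 0;
}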
@@ -1085,10 +1085,10 @@ o32_re = o32_im = 0;
 spinorFloat a30_re = o00_re - o20_re;
 spinorFloat a30_im = o00_im - o20_im;

-o00_re = a00_re;
-o10_re = a10_re;
-o20_re = a20_re;
-o30_re = a30_re;
+o00_re = a00_re; o00_im = a00_im;
+o10_re = a10_re; o10_im = a10_im;
+o20_re = a20_re; o20_im = a20_im;
+o30_re = a30_re; o30_im = a30_im;
 }

 {
 spinorFloat a01_re = -o11_re - o31_re;
@@ -1100,10 +1100,10 @@ o32_re = o32_im = 0;
 spinorFloat a31_re = o01_re - o21_re;
 spinorFloat a31_im = o01_im - o21_im;

-o01_re = a01_re;
-o11_re = a11_re;
-o21_re = a21_re;
-o31_re = a31_re;
+o01_re = a01_re; o01_im = a01_im;
+o11_re = a11_re; o11_im = a11_im;
+o21_re = a21_re; o21_im = a21_im;
+o31_re = a31_re; o31_im = a31_im;
 }

 {
 spinorFloat a02_re = -o12_re - o32_re;
@@ -1115,10 +1115,10 @@ o32_re = o32_im = 0;
 spinorFloat a32_re = o02_re - o22_re;
 spinorFloat a32_im = o02_im - o22_im;

-o02_re = a02_re;
-o12_re = a12_re;
-o22_re = a22_re;
-o32_re = a32_re;
+o02_re = a02_re; o02_im = a02_im;
+o12_re = a12_re; o12_im = a12_im;
+o22_re = a22_re; o22_im = a22_im;
+o32_re = a32_re; o32_im = a32_im;