Advanced Computing Platform for Theoretical Physics

Commit 458d290a authored by rbabich's avatar rbabich
Browse files

more work on quda clover


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@462 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7f8a13b9
......@@ -6,7 +6,7 @@ CPU_ARCH = x86_64 # x86 or x86_64
GPU_ARCH = sm_13 # sm_10, sm_11, sm_12, or sm_13
GPU_EMU = false # set to 'true' for device emulation
PYTHON = python2.6 # python 2.5 or later required for 'make generated'
PYTHON = python2.6 # python 2.5 or later required for 'make gen'
######
......@@ -36,44 +36,45 @@ LDFLAGS = -fPIC $(LIB)
all: dslash_test invert_test su3_test pack_test
ILIB = libquda.a
ILIB_OBJS = blas_quda.o blas_reference.o dslash_quda.o dslash_reference.o \
gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o \
spinor_quda.o util_quda.o
ILIB_HDRS = blas_quda.h blas_reference.h dslash_def.h dslash_quda.h \
dslash_reference.h enum_quda.h gauge_quda.h invert_quda.h \
io_spinor.h quda.h read_clover.h read_gauge.h reduce_complex_core.h \
reduce_core.h reduce_triple_core.h spinor_quda.h util_quda.h
ILIB_CORE = dslash_core.h dslash_dagger_core.h
QUDA = libquda.a
QUDA_OBJS = blas_quda.o blas_reference.o clover_quda.o dslash_quda.o \
dslash_reference.o gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o \
invert_quda.o spinor_quda.o util_quda.o
QUDA_HDRS = blas_quda.h blas_reference.h clover_def.h dslash_def.h \
dslash_quda.h dslash_reference.h enum_quda.h gauge_quda.h \
invert_quda.h io_spinor.h quda.h read_clover.h read_gauge.h \
reduce_complex_core.h reduce_core.h reduce_triple_core.h \
spinor_quda.h util_quda.h
QUDA_CORE = clover_core.h dslash_core.h dslash_dagger_core.h
$(ILIB): $(ILIB_OBJS)
ar cru $@ $(ILIB_OBJS)
$(QUDA): $(QUDA_OBJS)
ar cru $@ $(QUDA_OBJS)
dslash_test: dslash_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
dslash_test: dslash_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
invert_test: invert_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
invert_test: invert_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
su3_test: su3_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
su3_test: su3_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
pack_test: pack_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
pack_test: pack_test.o $(QUDA)
$(CXX) $(LDFLAGS) $< $(QUDA) -o $@
generated:
gen:
$(PYTHON) dslash_cuda_gen.py
clean:
-rm -f *.o dslash_test invert_test su3_test pack_test $(ILIB)
-rm -f *.o dslash_test invert_test su3_test pack_test $(QUDA)
%.o: %.c $(ILIB_HDRS)
%.o: %.c $(QUDA_HDRS)
$(CC) $(CFLAGS) $< -c -o $@
%.o: %.cpp $(ILIB_HDRS)
%.o: %.cpp $(QUDA_HDRS)
$(CXX) $(CXXFLAGS) $< -c -o $@
%.o: %.cu $(ILIB_HDRS) $(ILIB_CORE)
%.o: %.cu $(QUDA_HDRS) $(QUDA_CORE)
$(NVCC) $(NVCCFLAGS) $< -c -o $@
.PHONY: all generated clean
.PHONY: all gen clean
QUDA v0.x Release Notes
-----------------------
Release Notes for QUDA v0.x
---------------------------
Overview:
......@@ -44,7 +45,8 @@ Installation:
In the source directory, copy the template 'Makefile.tmpl' to
'Makefile', and edit the first few lines to specify the CUDA install
path, the platform (x86 or x86_64), and the GPU architecture (see
"Compatibility" above). Then type 'make' to build the library.
"Hardware compatibility" above). Then type 'make' to build the
library.
Using the library:
......@@ -74,9 +76,11 @@ For help or to report a bug, please contact Mike Clark
(mikec@seas.harvard.edu) or Ron Babich (rbabich@bu.edu).
If you find this code useful in your work, a citation to the following
write-up would be appreciated:
would be appreciated:
K. Barros et al., "Blasting through lattice calculations using CUDA,"
PoS LATTICE2008, 045 (2008) [arXiv:0810.5365 [hep-lat]].
Please also let us know so that we can send you updates and bug-fixes.
Please also drop us a note so that we can send you updates and
bug-fixes.
This diff is collapsed.
// clover_def.h - clover kernel definitions
//
// This header includes itself repeatedly (see the end of the file); each pass
// selects one combination of the DD_* option counters, emits one kernel for
// it, and then advances the counters.

// initialize on first iteration
// DD_LOOP doubles as the "a pass is in progress" flag: it is only undefined
// again after the final combination has been handled.
#ifndef DD_LOOP
#define DD_LOOP
#define DD_XPAY 0
#define DD_SPREC 0
#define DD_CPREC 0
#endif
// set options for current iteration
// DD_XPAY_F is the kernel-name suffix and DD_PARAM2 the trailing kernel
// parameters; the xpay variant adds a scalar 'a' whose type follows DD_SPREC.
#if (DD_XPAY==0) // no xpay
#define DD_XPAY_F
#define DD_PARAM2 int oddBit
#else // xpay
#define DD_XPAY_F Xpay
#if (DD_SPREC == 0)
#define DD_PARAM2 int oddBit, double a
#else
#define DD_PARAM2 int oddBit, float a
#endif
#define DSLASH_XPAY
#endif
// Spinor precision selection (DD_SPREC: 0 = double, 1 = single, otherwise half).
// Each branch picks the name letter (DD_SPREC_F), the output-pointer
// parameters (DD_PARAM1), and the read/write/texture macros for that
// precision. The ACCUM* macros are only defined for the xpay variant.
#if (DD_SPREC==0) // double-precision spinor field
#define DD_SPREC_F D
#define DD_PARAM1 double2* g_out
#define READ_SPINOR READ_SPINOR_DOUBLE
#define SPINORTEX spinorTexDouble
#define WRITE_SPINOR WRITE_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
#if (DD_XPAY==1)
#define ACCUMTEX accumTexDouble
#define READ_ACCUM READ_ACCUM_DOUBLE
#endif
#elif (DD_SPREC==1) // single-precision spinor field
#define DD_SPREC_F S
#define DD_PARAM1 float4* g_out
#define READ_SPINOR READ_SPINOR_SINGLE
#define SPINORTEX spinorTexSingle
#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
#if (DD_XPAY==1)
#define ACCUMTEX accumTexSingle
#define READ_ACCUM READ_ACCUM_SINGLE
#endif
#else // half-precision spinor field
#define DD_SPREC_F H
#define READ_SPINOR READ_SPINOR_HALF
#define SPINORTEX spinorTexHalf
// the half-precision kernel takes an extra float array 'c' next to the
// short4 output (presumably the per-site norm -- confirm in clover_core.h)
#define DD_PARAM1 short4* g_out, float *c
#define WRITE_SPINOR WRITE_SPINOR_SHORT4
#if (DD_XPAY==1)
#define ACCUMTEX accumTexHalf
#define READ_ACCUM READ_ACCUM_HALF
#endif
#endif
// Clover-term precision selection (DD_CPREC: 0 = double, 1 = single,
// otherwise half): name letter, texture reference and read macro.
#if (DD_CPREC==0) // double-precision clover term
#define DD_CPREC_F D
#define CLOVERTEX cloverTexDouble
#define READ_CLOVER READ_CLOVER_DOUBLE
#define CLOVER_DOUBLE
#elif (DD_CPREC==1) // single-precision clover term
#define DD_CPREC_F S
#define CLOVERTEX cloverTexSingle
#define READ_CLOVER READ_CLOVER_SINGLE
#else // half-precision clover term
#define DD_CPREC_F H
#define CLOVERTEX cloverTexHalf
#define READ_CLOVER READ_CLOVER_HALF
#endif
// Kernel name is clover<spinor letter><clover letter><xpay suffix>Kernel.
// DD_FUNC adds one level of indirection so that the DD_*_F arguments are
// macro-expanded before DD_CONCAT pastes them together.
#define DD_CONCAT(s,c,x) clover ## s ## c ## x ## Kernel
#define DD_FUNC(s,c,x) DD_CONCAT(s,c,x)
// define the kernel
// Variants using a double-precision spinor or clover field are omitted
// unless __CUDA_ARCH__ is exactly 130.
#if !(__CUDA_ARCH__ != 130 && (DD_SPREC == 0 || DD_CPREC == 0))
__global__ void
DD_FUNC(DD_SPREC_F, DD_CPREC_F, DD_XPAY_F)(DD_PARAM1, DD_PARAM2) {
// the kernel body is shared by all variants and specialised through the
// macros defined above
#include "clover_core.h"
}
#endif
// clean up
// Drop every per-variant macro so the next pass can redefine it; the DD_XPAY,
// DD_SPREC and DD_CPREC counters are deliberately left alone here.
#undef DD_SPREC_F
#undef DD_CPREC_F
#undef DD_XPAY_F
#undef DD_PARAM1
#undef DD_PARAM2
#undef DD_CONCAT
#undef DD_FUNC
#undef DSLASH_XPAY
#undef READ_SPINOR
#undef SPINORTEX
#undef WRITE_SPINOR
#undef ACCUMTEX
#undef READ_ACCUM
#undef CLOVERTEX
#undef READ_CLOVER
// NOTE(review): GAUGE_DOUBLE is never defined in this header; the #undef is
// harmless but may be a leftover -- confirm whether clover_core.h needs it.
#undef GAUGE_DOUBLE
#undef SPINOR_DOUBLE
#undef CLOVER_DOUBLE
// prepare next set of options, or clean up after final iteration
// DD_SPREC is the fastest-running counter (0 -> 1 -> 2); when it wraps back
// to 0, DD_CPREC advances. Once both have reached 2 all counters and DD_LOOP
// are undefined, which stops the self-inclusion below. The DD_XPAY toggle is
// commented out, so DD_XPAY stays 0 for every pass.
//#if (DD_XPAY==0) // xpay variant is not needed
//#undef DD_XPAY
//#define DD_XPAY 1
//#else
//#undef DD_XPAY
//#define DD_XPAY 0
#if (DD_SPREC==0)
#undef DD_SPREC
#define DD_SPREC 1
#elif (DD_SPREC==1)
#undef DD_SPREC
#define DD_SPREC 2
#else
#undef DD_SPREC
#define DD_SPREC 0
#if (DD_CPREC==0)
#undef DD_CPREC
#define DD_CPREC 1
#elif (DD_CPREC==1)
#undef DD_CPREC
#define DD_CPREC 2
#else
#undef DD_LOOP
#undef DD_XPAY
#undef DD_SPREC
#undef DD_CPREC
#endif // DD_CPREC
#endif // DD_SPREC
//#endif // DD_XPAY
// recurse for the next option combination while the loop flag is still set
#ifdef DD_LOOP
#include "clover_def.h"
#endif
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <quda.h>
#include <spinor_quda.h>
#include <util_quda.h>
// Allocate device storage for one parity (even or odd) of the clover term.
//
//   X          four lattice extents; their product becomes ret.volume
//   precision  storage precision of the device copy
//
// Returns a ParityClover whose 'clover' pointer refers to freshly allocated
// device memory of ret.bytes bytes.  For QUDA_HALF_PRECISION a second device
// array 'cloverNorm' of ret.bytes/18 bytes is allocated as well; for the other
// precisions 'cloverNorm' is set to NULL.  Any CUDA allocation failure prints
// a message and terminates the program with a non-zero status.
ParityClover allocateParityClover(int *X, Precision precision)
{
  ParityClover ret;

  ret.precision = precision;
  ret.volume = 1;
  for (int d=0; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  ret.Ns = 4;
  ret.length = ret.volume*ret.Nc*ret.Nc*ret.Ns*ret.Ns/2; // block-diagonal Hermitian (72 reals)

  if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
  else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
  else ret.bytes = ret.length*sizeof(short);

  // start from a defined state so that a later free never sees garbage
  ret.clover = NULL;
  ret.cloverNorm = NULL;

  // treat every failure code as fatal, not only cudaErrorMemoryAllocation
  if (cudaMalloc((void**)&(ret.clover), ret.bytes) != cudaSuccess) {
    printf("Error allocating clover term\n");
    exit(-1);
  }
  if (precision == QUDA_HALF_PRECISION) {
    if (cudaMalloc((void**)&ret.cloverNorm, ret.bytes/18) != cudaSuccess) {
      printf("Error allocating cloverNorm\n");
      exit(-1);
    }
  }
  return ret;
}
// Allocate device storage for both parities of the clover term; the even and
// odd halves are created with the same extents and precision.
FullClover allocateCloverField(int *X, Precision precision)
{
  FullClover field;

  field.even = allocateParityClover(X, precision);
  field.odd  = allocateParityClover(X, precision);

  return field;
}
// Release the device memory owned by one parity of the clover term and reset
// the freed pointers to NULL.
//
// The norm array only exists for half-precision fields (see
// allocateParityClover), so it is released only in that case; previously it
// was never freed and leaked on every half-precision field.
void freeParityClover(ParityClover *clover)
{
  cudaFree(clover->clover);
  clover->clover = NULL;

  if (clover->precision == QUDA_HALF_PRECISION) {
    cudaFree(clover->cloverNorm);
    clover->cloverNorm = NULL;
  }
}
// Release both parities of a clover field, even half first.
void freeCloverField(FullClover *clover)
{
  ParityClover *evenHalf = &clover->even;
  ParityClover *oddHalf = &clover->odd;

  freeParityClover(evenHalf);
  freeParityClover(oddHalf);
}
// Scatter the 72 consecutive reals at 'b' into 18 float4 elements starting at
// 'a', placing successive elements Vh entries apart.
template <typename Float>
static inline void packCloverMatrix(float4* a, Float *b, int Vh)
{
  for (int chunk = 0; chunk < 18; ++chunk) {
    float4 &dst = a[chunk*Vh];
    Float *src = b + 4*chunk;
    dst.x = src[0];
    dst.y = src[1];
    dst.z = src[2];
    dst.w = src[3];
  }
}
// Scatter the 72 consecutive reals at 'b' into 36 double2 elements starting at
// 'a', placing successive elements Vh entries apart.
template <typename Float>
static inline void packCloverMatrix(double2* a, Float *b, int Vh)
{
  for (int pair = 0; pair < 36; ++pair) {
    double2 &dst = a[pair*Vh];
    Float *src = b + 2*pair;
    dst.x = src[0];
    dst.y = src[1];
  }
}
// Pack a single-parity clover field: site 'site' reads its 72 reals from
// clover+72*site and is written starting at res+site with stride Vh.
template <typename Float, typename FloatN>
static void packParityClover(FloatN *res, Float *clover, int Vh)
{
  int site = 0;
  while (site < Vh) {
    FloatN *dst = res + site;
    Float *src = clover + 72*site;
    packCloverMatrix(dst, src, Vh);
    ++site;
  }
}
// Split a lexicographically ordered full-lattice clover field into packed
// even and odd parity fields.
//
//   even, odd  destination buffers, one per parity
//   clover     source field, 72 reals per site in full-lattice order
//   X          extents of ONE parity, i.e. X[0] is half the full extent
//              (the product of X is used as the parity volume Vh)
//
// For half-lattice index i the two candidate full-lattice sites are 2*i and
// 2*i+1; which of them is even depends on the parity of the row they sit in.
// The row number is i divided by the HALF row length X[0]; the original code
// divided by the doubled extent and therefore picked the wrong site on every
// odd row.  It also doubled X[0] in the caller's array as a side effect; X is
// now left untouched.
// NOTE(review): like the original, this assumes the remaining extents are
// even so that the sum of the quotients has the parity of x1+x2+x3 -- confirm.
template <typename Float, typename FloatN>
static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X)
{
  const int X0h = X[0];                 // half of the full x-extent
  const int Vh = X0h*X[1]*X[2]*X[3];    // sites per parity

  for (int i=0; i<Vh; i++) {
    int boundaryCrossings = i/X0h + i/(X[1]*X0h) + i/(X[2]*X[1]*X0h);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrix(even+i, clover+72*k, Vh);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrix(odd+i, clover+72*k, Vh);
    }
  }
}
// Pack the clover matrix of one site into half precision (short4 + norm).
//
//   res     destination for this site; successive short4 elements of the site
//           are stored Vh entries apart
//   norm    destination for this site's two scale factors, stored Vh apart
//   clover  72 consecutive reals: two chiral blocks of 36 reals each
//   Vh      stride between successive elements of one site
//
// Each chiral block is scaled by MAX_SHORT/max|entry| before truncation to
// short, and the inverse scale is stored in 'norm'.
//
// The second block has to start 9 strided elements after the first, i.e. at
// res[9*Vh]; the original advanced 'res' by only 9 entries, so the second
// block landed on top of other sites' first-block data.
// NOTE(review): a block whose entries are all zero makes max == 0 and the
// scale infinite; not handled here, as in the original.
template<typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{
  Float max, a, c;

  // treat the two chiral blocks separately
  for (int chi=0; chi<2; chi++) {
    short4 *out = res + 9*chi*Vh;   // first strided element of this block
    Float *in = clover + 36*chi;    // first real of this block

    max = fabs(in[0]);
    for (int i=1; i<36; i++) {
      if ((a = fabs(in[i])) > max) max = a;
    }
    c = MAX_SHORT/max;

    for (int i=0; i<9; i++) {
      out[i*Vh].x = (short) (c * in[4*i+0]);
      out[i*Vh].y = (short) (c * in[4*i+1]);
      out[i*Vh].z = (short) (c * in[4*i+2]);
      out[i*Vh].w = (short) (c * in[4*i+3]);
    }
    norm[chi*Vh] = 1/c;
  }
}
// Half-precision counterpart of packParityClover: packs every site of one
// parity, handing each site its own offset into the data and norm buffers.
template <typename Float>
static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh)
{
  int site = 0;
  while (site < Vh) {
    short4 *dst = res + site;
    float *dstNorm = norm + site;
    Float *src = clover + 72*site;
    packCloverMatrixHalf(dst, dstNorm, src, Vh);
    ++site;
  }
}
// Half-precision counterpart of packFullClover: split a lexicographically
// ordered full-lattice clover field into packed even and odd parity fields
// with their per-site norms.
//
//   even/evenNorm, odd/oddNorm  destination buffers, one pair per parity
//   clover                      source field, 72 reals per site
//   X                           extents of ONE parity (X[0] is half the full
//                               extent); the product is the parity volume Vh
//
// The row number of half-lattice index i is i divided by the HALF row length
// X[0]; the original divided by the doubled extent and so chose the wrong
// site on every odd row.  It also doubled X[0] in the caller's array as a
// side effect; X is now left untouched.
// NOTE(review): like the original, this assumes the remaining extents are
// even so that the sum of the quotients has the parity of x1+x2+x3 -- confirm.
template <typename Float>
static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm,
			       Float *clover, int *X)
{
  const int X0h = X[0];                 // half of the full x-extent
  const int Vh = X0h*X[1]*X[2]*X[3];    // sites per parity

  for (int i=0; i<Vh; i++) {
    int boundaryCrossings = i/X0h + i/(X[1]*X0h) + i/(X[2]*X[1]*X0h);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrixHalf(even+i, evenNorm+i, clover+72*k, Vh);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrixHalf(odd+i, oddNorm+i, clover+72*k, Vh);
    }
  }
}
// Pack a single-parity host clover field and copy it to the device.
//
//   ret           destination field (already allocated on the device)
//   clover        host field for this parity, 72 reals per site
//   cpu_prec      precision of the host data
//   clover_order  must be QUDA_PACKED_CLOVER_ORDER
//
// The data is repacked into a temporary host buffer (pinned memory unless
// built for device emulation), copied to ret.clover, and the buffer is
// released.  Half-precision fields additionally get a norm buffer of
// ret.bytes/18 bytes copied to ret.cloverNorm.  Unsupported precision or
// ordering combinations print an error and terminate the program.
void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
		      CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedClover = NULL;
  void *packedCloverNorm = NULL;   // only allocated for half precision

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
    exit(-1);
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    printf("QUDA error: invalid clover order\n");
    exit(-1);
  }

#ifndef __DEVICE_EMULATION__
  cudaMallocHost(&packedClover, ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) cudaMallocHost(&packedCloverNorm, ret.bytes/18);
#else
  packedClover = malloc(ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
#endif

  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (double *)clover, ret.volume);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, (float *)clover, ret.volume);
    }
  }

  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
  if (ret.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  // was CUDA_HALF_PRECISION, an identifier used nowhere else in this file,
  // which broke the emulation build
  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
#endif
}
// Repack a lexicographically ordered host clover field into separate even and
// odd parity buffers and copy both to the device.
//
//   ret           destination field (both parities already allocated)
//   clover        host field covering the whole lattice
//   cpu_prec      precision of the host data
//   clover_order  must be QUDA_LEX_PACKED_CLOVER_ORDER
//
// Buffer sizes for both parities are taken from ret.even.  Temporary host
// buffers are pinned unless built for device emulation.  Unsupported
// precision or ordering combinations print an error and terminate.
void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
		    CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;

  const bool gpuDouble = (ret.even.precision == QUDA_DOUBLE_PRECISION);
  const bool gpuSingle = (ret.even.precision == QUDA_SINGLE_PRECISION);
  const bool gpuHalf = (ret.even.precision == QUDA_HALF_PRECISION);
  const bool cpuDouble = (cpu_prec == QUDA_DOUBLE_PRECISION);

  if (gpuDouble && !cpuDouble) {
    printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
    exit(-1);
  }
  if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    printf("QUDA error: invalid clover order\n");
    exit(-1);
  }

  // sizes of the data and norm buffers of one parity
  const size_t fieldBytes = ret.even.bytes;
  const size_t normBytes = ret.even.bytes/18;

  // --- allocate host staging buffers ---
#ifndef __DEVICE_EMULATION__
  cudaMallocHost(&packedEven, fieldBytes);
  cudaMallocHost(&packedOdd, fieldBytes);
  if (gpuHalf) {
    cudaMallocHost(&packedEvenNorm, normBytes);
    cudaMallocHost(&packedOddNorm, normBytes);
  }
#else
  packedEven = malloc(fieldBytes);
  packedOdd = malloc(fieldBytes);
  if (gpuHalf) {
    packedEvenNorm = malloc(normBytes);
    packedOddNorm = malloc(normBytes);
  }
#endif

  // --- repack, dispatching on device and host precision ---
  if (gpuDouble) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X);
  } else if (gpuSingle) {
    if (cpuDouble) packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X);
    else packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X);
  } else {
    short4 *evenData = (short4 *)packedEven;
    short4 *oddData = (short4 *)packedOdd;
    float *evenNorm = (float *)packedEvenNorm;
    float *oddNorm = (float *)packedOddNorm;
    if (cpuDouble) packFullCloverHalf(evenData, evenNorm, oddData, oddNorm, (double *)clover, ret.even.X);
    else packFullCloverHalf(evenData, evenNorm, oddData, oddNorm, (float *)clover, ret.even.X);
  }

  // --- upload ---
  cudaMemcpy(ret.even.clover, packedEven, fieldBytes, cudaMemcpyHostToDevice);
  cudaMemcpy(ret.odd.clover, packedOdd, fieldBytes, cudaMemcpyHostToDevice);
  if (gpuHalf) {
    cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, normBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, normBytes, cudaMemcpyHostToDevice);
  }

  // --- release staging buffers ---
#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (gpuHalf) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
#else
  free(packedEven);
  free(packedOdd);
  if (gpuHalf) {
    free(packedEvenNorm);
    free(packedOddNorm);
  }
#endif
}
// Load a host clover field onto the device, dispatching on its ordering:
//   QUDA_LEX_PACKED_CLOVER_ORDER  whole-lattice field, split by loadFullClover
//   QUDA_PACKED_CLOVER_ORDER      even half followed by odd half, each loaded
//                                 by loadParityClover
// Any other ordering prints an error and terminates the program.
void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFieldOrder clover_order)
{
  if (clover_order == QUDA_LEX_PACKED_CLOVER_ORDER) {
    loadFullClover(ret, clover, cpu_prec, clover_order);
    return;
  }

  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    printf("QUDA error: CloverFieldOrder %d not supported\n", clover_order);
    exit(-1);
  }

  // the odd half starts ret.even.length reals after the even half, measured
  // in the host precision
  void *oddHalf = (cpu_prec == QUDA_SINGLE_PRECISION)
    ? (void *)((float *)clover + ret.even.length)
    : (void *)((double *)clover + ret.even.length);

  loadParityClover(ret.even, clover, cpu_prec, clover_order);
  loadParityClover(ret.odd, oddHalf, cpu_prec, clover_order);
}
......@@ -1085,10 +1085,10 @@ o32_re = o32_im = 0;
spinorFloat a30_re = o00_re - o20_re;
spinorFloat a30_im = o00_im - o20_im;
o00_re = a00_re;
o10_re = a10_re;
o20_re = a20_re;
o30_re = a30_re;
o00_re = a00_re; o00_im = a00_im;
o10_re = a10_re; o10_im = a10_im;
o20_re = a20_re; o20_im = a20_im;
o30_re = a30_re; o30_im = a30_im;
}
{
spinorFloat a01_re = -o11_re - o31_re;
......@@ -1100,10 +1100,10 @@ o32_re = o32_im = 0;
spinorFloat a31_re = o01_re - o21_re;
spinorFloat a31_im = o01_im - o21_im;
o01_re = a01_re;
o11_re = a11_re;
o21_re = a21_re;
o31_re = a31_re;
o01_re = a01_re; o01_im = a01_im;
o11_re = a11_re; o11_im = a11_im;
o21_re = a21_re; o21_im = a21_im;
o31_re = a31_re; o31_im = a31_im;
}
{
spinorFloat a02_re = -o12_re - o32_re;
......@@ -1115,10 +1115,10 @@ o32_re = o32_im = 0;
spinorFloat a32_re = o02_re - o22_re;
spinorFloat a32_im = o02_im - o22_im;
o02_re = a02_re;
o12_re = a12_re;
o22_re = a22_re;
o32_re = a32_re;
o02_re = a02_re; o02_im = a02_im;
o12_re = a12_re; o12_im = a12_im;
o22_re = a22_re; o22_im = a22_im;
o32_re = a32_re; o32_im = a32_im;
}
// apply first chiral block
......@@ -1329,10 +1329,10 @@ o32_re = o32_im = 0;
spinorFloat a30_re = -o00_re + o20_re;
spinorFloat a30_im = -o00_im + o20_im;
o00_re = a00_re;
o10_re = a10_re;
o20_re = a20_re;
o30_re = a30_re;
o00_re = a00_re; o00_im = a00_im;
o10_re = a10_re; o10_im = a10_im;
o20_re = a20_re; o20_im = a20_im;
o30_re = a30_re; o30_im = a30_im;
}
{
spinorFloat a01_re = o11_re + o31_re;
......@@ -1344,10 +1344,10 @@ o32_re = o32_im = 0;
spinorFloat a31_re = -o01_re + o21_re;
spinorFloat a31_im = -o01_im + o21_im;
o01_re = a01_re;
o11_re = a11_re;
o21_re = a21_re;
o31_re = a31_re;
o01_re = a01_re; o01_im = a01_im;
o11_re = a11_re; o11_im = a11_im;
o21_re = a21_re; o21_im = a21_im;
o31_re = a31_re; o31_im = a31_im;
}
{
spinorFloat a02_re = o12_re + o32_re;
......@@ -1359,10 +1359,10 @@ o32_re = o32_im = 0;
spinorFloat a32_re = -o02_re + o22_re;
spinorFloat a32_im = -o02_im + o22_im;
o02_re = a02_re;
o12_re = a12_re;
o22_re = a22_re;
o32_re = a32_re;
o02_re = a02_re; o02_im = a02_im;
o12_re = a12_re; o12_im = a12_im;
o22_re = a22_re; o22_im = a22_im;
o32_re = a32_re; o32_im = a32_im;
}
#endif // DSLASH_CLOVER
......@@ -1474,3 +1474,40 @@ o32_re = o32_im = 0;
#undef i31_im
#undef i32_re
#undef i32_im
#undef c00_00_re
#undef c01_01_re
#undef c02_02_re
#undef c10_10_re
#undef c11_11_re
#undef c12_12_re
#undef c01_00_re
#undef c01_00_im
#undef c02_00_re
#undef c02_00_im
#undef c10_00_re
#undef c10_00_im
#undef c11_00_re
#undef c11_00_im
#undef c12_00_re
#undef c12_00_im
#undef c02_01_re
#undef c02_01_im
#undef c10_01_re
#undef c10_01_im
#undef c11_01_re
#undef c11_01_im
#undef c12_01_re
#undef c12_01_im
#undef c10_02_re
#undef c10_02_im
#undef c11_02_re
#undef c11_02_im
#undef c12_02_re
#undef c12_02_im
#undef c11_10_re
#undef c11_10_im
#undef c12_10_re
#undef c12_10_im
#undef c12_11_re
#undef c12_11_im
......@@ -467,7 +467,8 @@ def toChiralBasis(c):
str.append("\n")
for s in range (0,4):
str.append(out_re(s,c)+" = "+a_re(0,s,c)+";\n")
str.append(out_re(s,c)+" = "+a_re(0,s,c)+"; ")
str.append(out_im(s,c)+" = "+a_im(0,s,c)+";\n")
return block(''.join(str))
# end def toChiralBasis
......@@ -486,7 +487,8 @@ def fromChiralBasis(c): # note: factor of 1/2 is included in clover term normali
str.append("\n")
for s in range (0,4):
str.append(out_re(s,c)+" = "+a_re(0,s,c)+";\n")
str.append(out_re(s,c)+" = "+a_re(0,s,c)+"; ")
str.append(out_im(s,c)+" = "+a_im(0,s,c)+";\n")
return block(''.join(str))
# end def fromChiralBasis
......@@ -516,7 +518,8 @@ def cloverMult(chi):
for s in range (0,2):
for c in range (0,3):
str.append(out_re(2*chi+s,c)+" = "+a_re(chi,s,c)+"; "+out_im(2*chi+s,c)+" = "+a_im(chi,s,c)+";\n")
str.append(out_re(2*chi+s,c)+" = "+a_re(chi,s,c)+"; ")
str.append(out_im(2*chi+s,c)+" = "+a_im(chi,s,c)+";\n")
str.append("\n")
return block(''.join(str))+"\n"
......@@ -595,6 +598,20 @@ def epilog():
i = 3*s+c
str.append("#undef "+in_re(s,c)+"\n")
str.append("#undef "+in_im(s,c)+"\n")
str.append("\n")
for m in range(0,6):
s = m/3
c = m%3
str.append("#undef "+c_re(0,s,c,s,c)+"\n")
for n in range(0,6):
sn = n/3
cn = n%3
for m in range(n+1,6):
sm = m/3
cm = m%3
str.append("#undef "+c_re(0,sm,cm,sn,cn)+"\n")
str.append("#undef "+c_im(0,sm,cm,sn,cn)+"\n")
return ''.join(str)
# end def epilog
......
......@@ -1085,10 +1085,10 @@ o32_re = o32_im = 0;
spinorFloat a30_re = o00_re - o20_re;
spinorFloat a30_im = o00_im - o20_im;
o00_re = a00_re;
o10_re = a10_re;
o20_re = a20_re;
o30_re = a30_re;
o00_re = a00_re; o00_im = a00_im;
o10_re = a10_re; o10_im = a10_im;
o20_re = a20_re; o20_im = a20_im;
o30_re = a30_re; o30_im = a30_im;
}
{
spinorFloat a01_re = -o11_re - o31_re;
......@@ -1100,10 +1100,10 @@ o32_re = o32_im = 0;
spinorFloat a31_re = o01_re - o21_re;
spinorFloat a31_im = o01_im - o21_im;
o01_re = a01_re;
o11_re = a11_re;
o21_re = a21_re;
o31_re = a31_re;
o01_re = a01_re; o01_im = a01_im;
o11_re = a11_re; o11_im = a11_im;
o21_re = a21_re; o21_im = a21_im;
o31_re = a31_re; o31_im = a31_im;
}
{
spinorFloat a02_re = -o12_re - o32_re;
......@@ -1115,10 +1115,10 @@ o32_re = o32_im = 0;
spinorFloat a32_re = o02_re - o22_re;
spinorFloat a32_im = o02_im - o22_im;
o02_re = a02_re;
o12_re = a12_re;
o22_re = a22_re;
o32_re = a32_re;
o02_re = a02_re; o02_im = a02_im;
o12_re = a12_re; o12_im = a12_im;
o22_re = a22_re; o22_im = a22_im;
o32_re = a32_re; o32_im = a32_im;
}
// apply first chiral block
......@@ -1329,10 +1329,10 @@ o32_re = o32_im = 0;
spinorFloat a30_re = -o00_re + o20_re;
spinorFloat a30_im = -o00_im + o20_im;
o00_re = a00_re;
o10_re = a10_re;
o20_re = a20_re;
o30_re = a30_re;
o00_re = a00_re; o00_im = a00_im;
o10_re = a10_re; o10_im = a10_im;
o20_re = a20_re; o20_im = a20_im;
o30_re = a30_re; o30_im = a30_im;
}
{
spinorFloat a01_re = o11_re + o31_re;
......@@ -1344,10 +1344,10 @@ o32_re = o32_im = 0;
spinorFloat a31_re = -o01_re + o21_re;
spinorFloat a31_im = -o01_im + o21_im;
o01_re = a01_re;
o11_re = a11_re;
o21_re = a21_re;
o31_re = a31_re;
o01_re = a01_re; o01_im = a01_im;
o11_re = a11_re; o11_im = a11_im;
o21_re = a21_re; o21_im = a21_im;
o31_re = a31_re; o31_im = a31_im;
}
{
spinorFloat a02_re = o12_re + o32_re;
......@@ -1359,10 +1359,10 @@ o32_re = o32_im = 0;
spinorFloat a32_re = -o02_re + o22_re;
spinorFloat a32_im = -o02_im + o22_im;
o02_re = a02_re;
o12_re = a12_re;
o22_re = a22_re;
o32_re = a32_re;
o02_re = a02_re; o02_im = a02_im;
o12_re = a12_re; o12_im = a12_im;
o22_re = a22_re; o22_im = a22_im;
o32_re = a32_re; o32_im = a32_im;
}
#endif // DSLASH_CLOVER
......@@ -1474,3 +1474,40 @@ o32_re = o32_im = 0;
#undef i31_im
#undef i32_re
#undef i32_im
#undef c00_00_re
#undef c01_01_re
#undef c02_02_re
#undef c10_10_re
#undef c11_11_re
#undef c12_12_re
#undef c01_00_re
#undef c01_00_im
#undef c02_00_re
#undef c02_00_im
#undef c10_00_re
#undef c10_00_im
#undef c11_00_re
#undef c11_00_im
#undef c12_00_re
#undef c12_00_im
#undef c02_01_re
#undef c02_01_im
#undef c10_01_re
#undef c10_01_im
#undef c11_01_re
#undef c11_01_im
#undef c12_01_re
#undef c12_01_im
#undef c10_02_re
#undef c10_02_im
#undef c11_02_re
#undef c11_02_im
#undef c12_02_re
#undef c12_02_im
#undef c11_10_re
#undef c11_10_im
#undef c12_10_re
#undef c12_10_im
#undef c12_11_re
#undef c12_11_im
......@@ -30,12 +30,12 @@
#ifndef DD_LOOP
#define DD_LOOP
#define DD_DAG 0
//#define DD_DAG 0
#define DD_XPAY 0
#define DD_RECON 0
#define DD_GPREC 0
#define DD_SPREC 0
#define DD_CPREC 1
#define DD_CPREC 0
#endif
// set options for current iteration
......@@ -211,12 +211,12 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
// prepare next set of options, or clean up after final iteration
#if (DD_DAG==0)
#undef DD_DAG
#define DD_DAG 1
#else
#undef DD_DAG
#define DD_DAG 0
//#if (DD_DAG==0)
//#undef DD_DAG
//#define DD_DAG 1
//#else
//#undef DD_DAG
//#define DD_DAG 0
#if (DD_XPAY==0)
#undef DD_XPAY
......@@ -252,14 +252,14 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_SPREC
#define DD_SPREC 0
//#if (DD_CPREC==0)
//#undef DD_CPREC
//#define DD_CPREC 1
#if (DD_CPREC==1)
#if (DD_CPREC==0)
#undef DD_CPREC
#define DD_CPREC 1
#elif (DD_CPREC==1)
#undef DD_CPREC
#define DD_CPREC 2
#elif (DD_CPREC==2)
#undef DD_CPREC
//#define DD_CPREC 2
//#elif (DD_CPREC==2)
//#undef DD_CPREC
#define DD_CPREC 3
#else
......@@ -276,7 +276,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#endif // DD_GPREC
#endif // DD_RECON
#endif // DD_XPAY
#endif // DD_DAG
//#endif // DD_DAG
#ifdef DD_LOOP
#include "dslash_def.h"
......
This diff is collapsed.
......@@ -16,16 +16,15 @@ extern "C" {
extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy;
extern FullClover cudaClover;
extern FullClover cudaCloverSloppy;
extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param;
extern FullClover cudaClover;
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
void initDslashCuda();
void bindGaugeTex(FullGauge gauge, int oddBit);
// Double precision routines
void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
......@@ -46,22 +45,76 @@ extern "C" {
int oddBit, int daggerBit, ParitySpinor x, double a);
// wrapper to above
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a);
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
int parity, int dagger);
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
int parity, int dagger, ParitySpinor x, double a);
// Full Wilson matrix
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int daggerBit);
void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa,
int daggerBit);
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type, int daggerBit);
double kappa, ParitySpinor tmp, MatPCType matpc_type,
int daggerBit);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge,
ParitySpinor inEven, double kappa, ParitySpinor tmp,
MatPCType matpc_type);
// clover Dslash routines
void cloverDslashCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in, int parity,
int dagger);
void cloverDslashDCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit);
void cloverDslashSCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit);
void cloverDslashHCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
double kappa, ParitySpinor tmp, MatPCType matpc_type);
void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in, int parity,
int dagger, ParitySpinor x, double a);
void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x,
double a);
void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x,
double a);
void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge,
FullClover cloverInv, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x,
double a);
void cloverMatPCCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in, double kappa,
ParitySpinor tmp, MatPCType matpc_type, int dagger);
void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge,
FullClover cloverInv, ParitySpinor in,
double kappa, ParitySpinor tmp,
MatPCType matpc_type);
void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover,
FullSpinor in, double kappa, ParitySpinor tmp,
int dagger);
// routines for applying the clover term alone
void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
ParitySpinor in, int parity);
void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
FullGauge gaugeSloppy, ParitySpinor tmp, QudaInvertParam *param);
FullGauge gaugeSloppy, ParitySpinor tmp,
QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
......
......@@ -9,6 +9,8 @@
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 1;
// clover-improved? (0 = plain Wilson, 1 = clover)
int dslash_type = 0;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -49,13 +51,22 @@ void init() {
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
if (dslash_type) {
inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
inv_param.clover_cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
} else {
inv_param.dslash_type = QUDA_WILSON_DSLASH;
}
gauge_param = &gaugeParam;
invert_param = &inv_param;
size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
......@@ -219,6 +230,10 @@ void dslashTest() {
int flops = test_type ? 1320*2 + 48 : 1320;
int floats = test_type ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
if (dslash_type) {
flops += test_type ? 504*2 : 504;
floats += test_type ? 72*2 : 72;
}
printf("GFLOPS = %f\n", 1.0e-9*flops*Vh/secs);
printf("GiB/s = %f\n\n", Vh*floats*sizeof(float)/(secs*(1<<30)));
......@@ -230,11 +245,8 @@ void dslashTest() {
if (test_type < 2) strong_check(spinorRef, spinorOdd, Vh, inv_param.cpu_prec);
else strong_check(spinorRef, spinorGPU, V, inv_param.cpu_prec);
}
end();
}
int main(int argc, char **argv) {
......
......@@ -17,6 +17,11 @@ extern "C" {
QUDA_LEX_DIRAC_ORDER // lexicographical order, colour inside spin
} QudaDiracFieldOrder;
typedef enum QudaCloverFieldOrder_s {
QUDA_PACKED_CLOVER_ORDER, // even-odd, packed
QUDA_LEX_PACKED_CLOVER_ORDER // lexicographical order, packed
} QudaCloverFieldOrder;
typedef enum QudaDslashType_s {
QUDA_WILSON_DSLASH,
QUDA_CLOVER_WILSON_DSLASH
......@@ -78,6 +83,12 @@ extern "C" {
QUDA_PERIODIC_T = 1
} QudaTboundary;
typedef enum QudaVerbosity_s {
QUDA_SILENT = 0,
QUDA_SUMMARIZE = 1,
QUDA_VERBOSE = 2
} QudaVerbosity;
typedef struct double3_s {
double x;
double y;
......
......@@ -72,6 +72,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
int k=0;
int xUpdate = 0, rUpdate = 0;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
......@@ -143,6 +144,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
}
k++;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
}
......@@ -154,6 +157,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor src, FullGauge gaugePrecise
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = (1.0e-9*x.volume)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
......
......@@ -59,6 +59,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
int k=0;
int xUpdate = 0, rUpdate = 0;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
......@@ -110,6 +111,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
}
k++;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
}
......@@ -121,6 +123,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugePrecise,
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = k*(1.0e-9*x.volume)*(2*(2*1320+48) + 10*spinorSiteSize);
......
......@@ -14,6 +14,9 @@
FullGauge cudaGaugePrecise; // precise gauge field
FullGauge cudaGaugeSloppy; // sloppy gauge field
FullClover cudaCloverPrecise; // precise clover field (presumably the clover counterpart of cudaGaugePrecise -- confirm)
FullClover cudaCloverSloppy; // sloppy clover field; loadCloverQuda() currently assigns it a copy of cudaCloverPrecise
void printGaugeParam(QudaGaugeParam *param) {
printf("Gauge Params:\n");
......@@ -36,6 +39,7 @@ void printGaugeParam(QudaGaugeParam *param) {
void printInvertParam(QudaInvertParam *param) {
printf("kappa = %e\n", param->kappa);
printf("mass_normalization = %d\n", param->mass_normalization);
printf("dslash_type = %d\n", param->dslash_type);
printf("inv_type = %d\n", param->inv_type);
printf("tol = %e\n", param->tol);
printf("iter = %d\n", param->iter);
......@@ -45,10 +49,19 @@ void printInvertParam(QudaInvertParam *param) {
printf("preserve_source = %d\n", param->preserve_source);
printf("cpu_prec = %d\n", param->cpu_prec);
printf("cuda_prec = %d\n", param->cuda_prec);
printf("cuda_prec_sloppy = %d\n", param->cuda_prec_sloppy);
printf("dirac_order = %d\n", param->dirac_order);
printf("spinorGiB = %e\n", param->spinorGiB);
if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
printf("clover_cpu_prec = %d\n", param->clover_cpu_prec);
printf("clover_cuda_prec = %d\n", param->clover_cuda_prec);
printf("clover_cuda_prec_sloppy = %d\n", param->clover_cuda_prec_sloppy);
printf("clover_order = %d\n", param->clover_order);
printf("cloverGiB = %e\n", param->cloverGiB);
}
printf("gflops = %e\n", param->gflops);
printf("secs = %f\n", param->secs);
printf("verbosity = %d\n", param->verbosity);
}
void initQuda(int dev)
......@@ -110,6 +123,14 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
}
// for now, only single-precision clover is supported
//
// Stub: as written, the body does not read h_clover, h_clovinv or param.
// Its only effect is to overwrite the global cudaCloverSloppy with the
// current contents of the global cudaCloverPrecise.
// NOTE(review): h_clover / h_clovinv are presumably host pointers to the
// clover term and its inverse -- confirm once the upload is implemented.
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *param)
{
// Member-wise struct assignment: any pointer members of FullClover end up
// shared between the precise and sloppy fields rather than duplicated.
cudaCloverSloppy = cudaCloverPrecise;
}
void endQuda()
{
freeSpinorBuffer();
......
......@@ -57,10 +57,19 @@ extern "C" {
QudaDiracFieldOrder dirac_order;
QudaPrecision clover_cpu_prec;
QudaPrecision clover_cuda_prec;
QudaPrecision clover_cuda_prec_sloppy;
QudaCloverFieldOrder clover_order;
double spinorGiB;
double cloverGiB;
double gflops;
double secs;
QudaVerbosity verbosity;
} QudaInvertParam;
// Interface functions
......
......@@ -53,8 +53,9 @@ int main(int argc, char **argv)
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; // preservation doesn't work with reliable?
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.verbosity = QUDA_VERBOSE;
size_t gSize = (Gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
......
......@@ -17,6 +17,7 @@
// The Quda is added to avoid collisions with other libs
#define GaugeFieldOrder QudaGaugeFieldOrder
#define DiracFieldOrder QudaDiracFieldOrder
#define CloverFieldOrder QudaCloverFieldOrder
#define InverterType QudaInverterType
#define Precision QudaPrecision
#define MatPCType QudaMatPCType
......@@ -55,14 +56,14 @@ extern "C" {
Precision precision;
int length;
int volume;
int X[4];
int Nc;
int Ns;
void *clover; // pointer to clover matrix
void *cloverInverse; // pointer to inverse of clover matrix
void *clover;
float *cloverNorm;
} ParityClover;
// Bundles two ParityClover fields (odd and even) with a precision tag.
// `Precision` is the local alias for QudaPrecision (#define'd in this header).
typedef struct {
Precision precision;
ParityClover odd; // presumably the odd-parity half of the lattice -- confirm
ParityClover even; // presumably the even-parity half of the lattice -- confirm
} FullClover;
......
......@@ -8,7 +8,7 @@
double2 C6 = fetch_double2((clover), sid + (18*chi+6)*Vh); \
double2 C7 = fetch_double2((clover), sid + (18*chi+7)*Vh); \
double2 C8 = fetch_double2((clover), sid + (18*chi+8)*Vh); \
double2 c9 = fetch_double2((clover), sid + (18*chi+9)*Vh); \
double2 C9 = fetch_double2((clover), sid + (18*chi+9)*Vh); \
double2 C10 = fetch_double2((clover), sid + (18*chi+10)*Vh); \
double2 C11 = fetch_double2((clover), sid + (18*chi+11)*Vh); \
double2 C12 = fetch_double2((clover), sid + (18*chi+12)*Vh); \
......@@ -39,7 +39,7 @@
float4 C6 = tex1Dfetch((clover), sid + (9*chi+6)*Vh); \
float4 C7 = tex1Dfetch((clover), sid + (9*chi+7)*Vh); \
float4 C8 = tex1Dfetch((clover), sid + (9*chi+8)*Vh); \
float K = tex1Dfetch((cloverTexNorm), sid+chi*Vh); \
float K = tex1Dfetch((cloverTexNorm), sid + chi*Vh); \
C0.x *= K; C0.y *= K; C0.z *= K; C0.w *= K; \
C1.x *= K; C1.y *= K; C1.z *= K; C1.w *= K; \
C2.x *= K; C2.y *= K; C2.z *= K; C2.w *= K; \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment