Advanced Computing Platform for Theoretical Physics

Commit e5f59d3a authored by rbabich
Browse files

quda: cleaned up header files (invert_quda.cpp is now interface_quda.cpp)


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@593 be54200a-260c-0410-bdd7-ce6af2a381ab
parent c3ded658
#include <cuComplex.h>
#include <enum_quda.h>
#ifndef _QUDA_BLAS_H
#define _QUDA_BLAS_H
#include <cuComplex.h>
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
#endif
......
// clover_quda.h
// C-linkage declarations for the routines that operate on ParityClover and
// FullClover objects. Judging by their names they cover allocation, release
// and loading of clover data.
// ParityClover, FullClover, Precision and CloverFieldOrder are not declared in
// this header; presumably they come from <quda_internal.h> -- TODO confirm.
#ifndef _CLOVER_QUDA_H
#define _CLOVER_QUDA_H
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
#endif
// allocate*: the object to set up is passed by pointer. X is an int array
// (presumably the lattice dimensions -- TODO confirm), pad is an int padding
// parameter, and precision selects the Precision of the new object.
void allocateParityClover(ParityClover *, int *X, int pad,
Precision precision);
void allocateCloverField(FullClover *, int *X, int pad, Precision precision);
// free*: counterparts of the allocate* routines; the object is passed by pointer.
void freeParityClover(ParityClover *clover);
void freeCloverField(FullClover *clover);
// load*: the destination `ret` is passed by value and the source `clover` is an
// untyped pointer; cpu_prec and clover_order describe the source data
// (presumably host-side precision and memory layout -- TODO confirm).
void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
void loadCloverField(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
// The declaration below is commented out and is not part of the interface.
/* void createCloverField(FullClover *cudaClover, void *cpuClover, int *X,
Precision precision); */
#ifdef __cplusplus
}
#endif
#endif // _CLOVER_QUDA_H
#ifndef _DSLASH_QUDA_H
#define _DSLASH_QUDA_H
#include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
#endif
extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy;
extern FullClover cudaCloverPrecise;
extern FullClover cudaCloverSloppy;
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
// ---------- dslash_quda.cu ----------
extern unsigned long long dslash_quda_flops;
extern unsigned long long dslash_quda_bytes;
int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
......@@ -107,17 +97,6 @@ extern "C" {
void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover,
ParitySpinor spinor, int oddBit);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type);
extern unsigned long long dslash_quda_flops;
extern unsigned long long dslash_quda_bytes;
#ifdef __cplusplus
}
#endif
......
#ifndef _GAUGE_QUDA_H
#define _GAUGE_QUDA_H
#include <enum_quda.h>
#include <dslash_quda.h>
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
......
// invert_quda.h
// C-linkage declarations used by the solver code: the global gauge and clover
// field objects (declared extern here, defined elsewhere) and the entry points
// of the CG and BiCGstab inverters.
// FullGauge, FullClover, ParitySpinor and DagType are not declared in this
// header; presumably they come from <quda_internal.h>, with QudaInvertParam
// from <quda.h> -- TODO confirm.
#ifndef _INVERT_QUDA_H
#define _INVERT_QUDA_H
#include <quda_internal.h>
#include <quda.h>
#ifdef __cplusplus
extern "C" {
#endif
// Global field objects; each comes in a "Precise" and a "Sloppy" variant.
// NOTE(review): presumably these are full- and reduced-precision copies of the
// same field -- confirm against the definitions.
extern FullGauge cudaGaugePrecise;
extern FullGauge cudaGaugeSloppy;
extern FullClover cudaCloverPrecise;
extern FullClover cudaCloverSloppy;
extern FullClover cudaCloverInvPrecise;
extern FullClover cudaCloverInvSloppy;
// Solver entry points. Both take spinors x, b and tmp by value plus a pointer
// to the inverter parameters; presumably x is the solution, b the source and
// tmp scratch space -- TODO confirm.
// NOTE(review): the file names in the two comments below may be stale (the
// build's object list uses inv_cg_quda.o / inv_bicgstab_quda.o) -- confirm.
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
// Takes an extra DagType argument compared with invertCgCuda.
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type);
#ifdef __cplusplus
}
#endif
#endif // _INVERT_QUDA_H
......@@ -78,7 +78,7 @@ extern "C" {
} QudaInvertParam;
// Interface functions, found in invert_quda.cpp
// Interface functions, found in interface_quda.cpp
void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
......
......@@ -100,7 +100,4 @@ extern "C" {
}
#endif
#include <blas_quda.h>
#include <dslash_quda.h>
#endif // _QUDA_INTERNAL_H
#ifndef _QUDA_SPINOR_H
#define _QUDA_SPINOR_H
#ifndef _SPINOR_QUDA_H
#define _SPINOR_QUDA_H
#include <enum_quda.h>
#include <dslash_quda.h>
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
#endif
// -- spinor_quda.cpp
ParitySpinor allocateParitySpinor(int *X, Precision precision, int stride);
FullSpinor allocateSpinorField(int *X, Precision precision, int stride);
......@@ -30,26 +27,8 @@ extern "C" {
void spinorHalfPack(float *c, short *s0, float *f0);
void spinorHalfUnpack(float *f0, float *c, short *s0);
// -- clover_quda.cpp
void allocateParityClover(ParityClover *, int *X, int pad, Precision precision);
void allocateCloverField(FullClover *, int *X, int pad, Precision precision);
void freeParityClover(ParityClover *clover);
void freeCloverField(FullClover *clover);
void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
void loadCloverField(FullClover ret, void *clover, Precision cpu_prec,
CloverFieldOrder clover_order);
/* void createCloverField(FullClover *cudaClover, void *cpuClover, int *X,
Precision precision); */
#ifdef __cplusplus
}
#endif
#endif // _QUDA_SPINOR_H
#endif // _SPINOR_QUDA_H
#ifndef _UTIL_QUDA_H
#define _UTIL_QUDA_H
#include <quda_internal.h>
#ifdef __cplusplus
extern "C" {
#endif
......
include ../make.inc
QUDA = libquda.a
QUDA_OBJS = blas_quda.o clover_quda.o dslash_quda.o gauge_quda.o \
inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o spinor_quda.o \
QUDA_OBJS = blas_quda.o clover_quda.o dslash_quda.o gauge_quda.o \
inv_bicgstab_quda.o inv_cg_quda.o interface_quda.o spinor_quda.o \
util_quda.o
# header files, found in include/
QUDA_HDRS = blas_quda.h dslash_quda.h enum_quda.h gauge_quda.h quda.h \
quda_internal.h spinor_quda.h util_quda.h
QUDA_HDRS = blas_quda.h clover_quda.h dslash_quda.h enum_quda.h gauge_quda.h \
invert_quda.h quda.h quda_internal.h spinor_quda.h util_quda.h
# files containing complex macros and other code fragments to be inlined,
# found in lib/
QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h \
dslash_def.h dslash_textures.h io_spinor.h read_clover.h \
read_gauge.h reduce_complex_core.h reduce_core.h \
QUDA_INLN = blas_param.h check_params.h clover_def.h dslash_common.h \
dslash_def.h dslash_textures.h io_spinor.h read_clover.h \
read_gauge.h reduce_complex_core.h reduce_core.h \
reduce_triple_core.h
# files generated by the scripts in lib/generate/, found in lib/dslash_core/
......
......@@ -2,6 +2,7 @@
#include <stdio.h>
#include <quda_internal.h>
#include <blas_quda.h>
#define REDUCE_MAX_BLOCKS 2048
......
......@@ -3,7 +3,7 @@
#include <math.h>
#include <quda_internal.h>
#include <spinor_quda.h>
#include <clover_quda.h>
void allocateParityClover(ParityClover *ret, int *X, int pad, Precision precision)
{
......
......@@ -36,4 +36,3 @@ __constant__ float pi_f;
// double precision constants
__constant__ double anisotropy;
__constant__ double t_boundary;
// dslash_def.h - Dslash kernel definitions
// There are currently 64 different variants of the Dslash kernel,
// There are currently 288 different variants of the Dslash kernel,
// each one characterized by a set of 6 options, where each option can
// take one of two values (2^6 = 64). This file is structured so that
// the C preprocessor loops through all 64 variants (in a manner
// resembling a binary counter), sets the appropriate macros, and
// defines the corresponding functions.
// take one of several values (3*3*4*2*2*2 = 288). This file is
// structured so that the C preprocessor loops through all 288
// variants (in a manner resembling a counter), sets the appropriate
// macros, and defines the corresponding functions.
//
// As an example of the function naming conventions, consider
//
......@@ -250,7 +250,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#define DD_SPREC 2
#else
#undef DD_SPREC // from here
#undef DD_SPREC
#define DD_SPREC 0
#if (DD_CPREC==0)
......@@ -263,7 +263,7 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_CPREC
#define DD_CPREC 3
#else // to here
#else
#undef DD_LOOP
#undef DD_DAG
......@@ -271,9 +271,9 @@ DD_FUNC(DD_GPREC_F, DD_SPREC_F, DD_CPREC_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F)(DD_
#undef DD_RECON
#undef DD_GPREC
#undef DD_SPREC
#undef DD_CPREC //
#undef DD_CPREC
#endif // DD_CPREC //
#endif // DD_CPREC
#endif // DD_SPREC
#endif // DD_GPREC
#endif // DD_RECON
......
......@@ -5,8 +5,8 @@
#include <dslash_quda.h>
#include <spinor_quda.h> // not needed once call to allocateParitySpinor() is removed
#include<dslash_textures.h>
#include<dslash_constants.h>
#include <dslash_textures.h>
#include <dslash_constants.h>
unsigned long long dslash_quda_flops;
unsigned long long dslash_quda_bytes;
......@@ -33,7 +33,7 @@ int dslashCudaSharedBytes(Precision precision) {
#include <dslash_common.h>
int initDslash = 0;
static int initDslash = 0;
void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
int Vh = gauge.volume;
......@@ -160,8 +160,6 @@ static void bindGaugeTex(FullGauge gauge, int oddBit) {
}
}
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// plain Wilson Dslash:
......
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <quda_internal.h>
#include <gauge_quda.h>
......
......@@ -4,8 +4,12 @@
#include <quda.h>
#include <quda_internal.h>
#include <spinor_quda.h>
#include <gauge_quda.h>
#include <spinor_quda.h>
#include <clover_quda.h>
#include <blas_quda.h>
#include <dslash_quda.h>
#include <invert_quda.h>
#define spinorSiteSize 24 // real numbers per spinor
......
......@@ -3,10 +3,11 @@
#include <math.h>
#include <cuComplex.h>
#include <quda.h>
#include <quda_internal.h>
#include <spinor_quda.h>
#include <blas_quda.h>
#include <dslash_quda.h>
#include <invert_quda.h>
#include <util_quda.h>
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
......
......@@ -2,10 +2,12 @@
#include <stdlib.h>
#include <math.h>
#include <quda.h>
#include <quda_internal.h>
#include <util_quda.h>
#include <spinor_quda.h>
#include <blas_quda.h>
#include <dslash_quda.h>
#include <invert_quda.h>
#include <util_quda.h>
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp) {
......
#define READ_SPINOR_DOUBLE(spinor) \
#define READ_SPINOR_DOUBLE(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \
......@@ -9,10 +9,10 @@
double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
#define READ_SPINOR_DOUBLE_UP(spinor) \
#define READ_SPINOR_DOUBLE_UP(spinor) \
double2 I0 = fetch_double2((spinor), sp_idx + 0*(sp_stride)); \
double2 I1 = fetch_double2((spinor), sp_idx + 1*(sp_stride)); \
double2 I2 = fetch_double2((spinor), sp_idx + 2*(sp_stride)); \
......@@ -20,15 +20,15 @@
double2 I4 = fetch_double2((spinor), sp_idx + 4*(sp_stride)); \
double2 I5 = fetch_double2((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_DOUBLE_DOWN(spinor) \
#define READ_SPINOR_DOUBLE_DOWN(spinor) \
double2 I6 = fetch_double2((spinor), sp_idx + 6*(sp_stride)); \
double2 I7 = fetch_double2((spinor), sp_idx + 7*(sp_stride)); \
double2 I8 = fetch_double2((spinor), sp_idx + 8*(sp_stride)); \
double2 I9 = fetch_double2((spinor), sp_idx + 9*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I10 = fetch_double2((spinor), sp_idx + 10*(sp_stride)); \
double2 I11 = fetch_double2((spinor), sp_idx + 11*(sp_stride));
#define READ_SPINOR_SINGLE(spinor) \
#define READ_SPINOR_SINGLE(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
......@@ -36,102 +36,102 @@
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_SINGLE_UP(spinor) \
#define READ_SPINOR_SINGLE_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
#define READ_SPINOR_SINGLE_DOWN(spinor) \
#define READ_SPINOR_SINGLE_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride));
#define READ_SPINOR_HALF(spinor) \
#define READ_SPINOR_HALF(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_SPINOR_HALF_UP(spinor) \
#define READ_SPINOR_HALF_UP(spinor) \
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*(sp_stride)); \
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*(sp_stride)); \
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I0.x *= C; I0.y *= C; I0.z *= C; I0.w *= C; \
I1.x *= C; I1.y *= C; I1.z *= C; I1.w *= C; \
I2.x *= C; I2.y *= C; I2.z *= C; I2.w *= C; \
#define READ_SPINOR_HALF_DOWN(spinor) \
#define READ_SPINOR_HALF_DOWN(spinor) \
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*(sp_stride)); \
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*(sp_stride)); \
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*(sp_stride)); \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
float C = tex1Dfetch((spinorTexNorm), sp_idx); \
I3.x *= C; I3.y *= C; I3.z *= C; I3.w *= C; \
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_ACCUM_DOUBLE(spinor) \
double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride)); \
double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride)); \
double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride)); \
double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride)); \
double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride)); \
double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride)); \
double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride)); \
double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \
#define READ_ACCUM_DOUBLE(spinor) \
double2 accum0 = fetch_double2((spinor), sid + 0*(sp_stride)); \
double2 accum1 = fetch_double2((spinor), sid + 1*(sp_stride)); \
double2 accum2 = fetch_double2((spinor), sid + 2*(sp_stride)); \
double2 accum3 = fetch_double2((spinor), sid + 3*(sp_stride)); \
double2 accum4 = fetch_double2((spinor), sid + 4*(sp_stride)); \
double2 accum5 = fetch_double2((spinor), sid + 5*(sp_stride)); \
double2 accum6 = fetch_double2((spinor), sid + 6*(sp_stride)); \
double2 accum7 = fetch_double2((spinor), sid + 7*(sp_stride)); \
double2 accum8 = fetch_double2((spinor), sid + 8*(sp_stride)); \
double2 accum9 = fetch_double2((spinor), sid + 9*(sp_stride)); \
double2 accum10 = fetch_double2((spinor), sid + 10*(sp_stride)); \
double2 accum11 = fetch_double2((spinor), sid + 11*(sp_stride));
#define READ_ACCUM_SINGLE(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
#define READ_ACCUM_SINGLE(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride));
#define READ_ACCUM_HALF(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); \
float C = tex1Dfetch((accumTexNorm), sid); \
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
#define READ_ACCUM_HALF(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*(sp_stride)); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*(sp_stride)); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*(sp_stride)); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*(sp_stride)); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*(sp_stride)); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*(sp_stride)); \
float C = tex1Dfetch((accumTexNorm), sid); \
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
#define WRITE_SPINOR_DOUBLE2() \
g_out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im); \
g_out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im); \
g_out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im); \
g_out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im); \
g_out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im); \
g_out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im); \
g_out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im); \
g_out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im); \
g_out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im); \
g_out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im); \
g_out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im); \
#define WRITE_SPINOR_DOUBLE2() \
g_out[0*(sp_stride)+sid] = make_double2(o00_re, o00_im); \
g_out[1*(sp_stride)+sid] = make_double2(o01_re, o01_im); \
g_out[2*(sp_stride)+sid] = make_double2(o02_re, o02_im); \
g_out[3*(sp_stride)+sid] = make_double2(o10_re, o10_im); \
g_out[4*(sp_stride)+sid] = make_double2(o11_re, o11_im); \
g_out[5*(sp_stride)+sid] = make_double2(o12_re, o12_im); \
g_out[6*(sp_stride)+sid] = make_double2(o20_re, o20_im); \
g_out[7*(sp_stride)+sid] = make_double2(o21_re, o21_im); \
g_out[8*(sp_stride)+sid] = make_double2(o22_re, o22_im); \
g_out[9*(sp_stride)+sid] = make_double2(o30_re, o30_im); \
g_out[10*(sp_stride)+sid] = make_double2(o31_re, o31_im); \
g_out[11*(sp_stride)+sid] = make_double2(o32_re, o32_im);
#define WRITE_SPINOR_FLOAT4() \
#define WRITE_SPINOR_FLOAT4() \
g_out[0*(sp_stride)+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*(sp_stride)+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
g_out[2*(sp_stride)+sid] = make_float4(o11_re, o11_im, o12_re, o12_im); \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment