Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit af7028a5 authored by mikeaclark's avatar mikeaclark
Browse files

Fixed bugs in gauge_quda.cpp and added Guochuns 8 reconstruct patch

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@580 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 7a56f565
......@@ -7,8 +7,8 @@
//#include <xmmintrin.h>
#define SHORT_LENGTH 65536
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
double Anisotropy;
QudaTboundary tBoundary;
......@@ -21,7 +21,7 @@ inline short FloatToShort(const Float &a) {
template <typename Float>
inline void ShortToFloat(Float &a, const short &b) {
a = (((Float)b/SCALE_FLOAT-SHIFT_FLOAT)*SCALE_FLOAT);
a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT);
}
// Routines used to pack the gauge field matrices
......@@ -172,7 +172,8 @@ inline void reconstruct8(Float *mat, int dir, int idx) {
row_sum += mat[4]*mat[4];
row_sum += mat[5]*mat[5];
Float u0 = (dir < 3 ? Anisotropy : (idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ? tBoundary : 1));
Float U00_mag = sqrt(1.f/(u0*u0) - row_sum);
Float diff = 1.f/(u0*u0) - row_sum;
Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);
mat[14] = mat[0];
mat[15] = mat[1];
......@@ -183,7 +184,8 @@ inline void reconstruct8(Float *mat, int dir, int idx) {
Float column_sum = 0.0;
for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
Float U20_mag = sqrt(1.f/(u0*u0) - column_sum);
diff = 1.f/(u0*u0) - column_sum;
Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);
mat[12] = U20_mag * cos(mat[15]);
mat[13] = U20_mag * sin(mat[15]);
......@@ -293,10 +295,10 @@ template <typename Float>
inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
float4 *dg = d_gauge + dir*3*V;
for (int j=0; j<3; j++) {
h_gauge[j*2+0] = dg[j*V].x;
h_gauge[j*2+1] = dg[j*V].y;
h_gauge[j*2+2] = dg[j*V].z;
h_gauge[j*2+3] = dg[j*V].w;
h_gauge[j*4+0] = dg[j*V].x;
h_gauge[j*4+1] = dg[j*V].y;
h_gauge[j*4+2] = dg[j*V].z;
h_gauge[j*4+3] = dg[j*V].w;
}
reconstruct12(h_gauge, dir, idx);
}
......@@ -305,10 +307,10 @@ template <typename Float>
inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
short4 *dg = d_gauge + dir*3*V;
for (int j=0; j<3; j++) {
ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
ShortToFloat(h_gauge[j*2+2], dg[j*V].z);
ShortToFloat(h_gauge[j*2+3], dg[j*V].w);
ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
}
reconstruct12(h_gauge, dir, idx);
}
......
......@@ -110,14 +110,14 @@
double u0 = (dir < 6 ? anisotropy : (ga_idx >= (X4-1)*X1h*X2*X3 ? t_boundary : 1)); \
double u02_inv = 1.0 / (u0*u0); \
double column_sum = u02_inv - row_sum; \
double U00_mag = sqrt(column_sum); \
double U00_mag = sqrt((column_sum > 0 ? column_sum : 0)); \
sincos(g21_re, &g00_im, &g00_re); \
g00_re *= U00_mag; \
g00_im *= U00_mag; \
column_sum += g10_re*g10_re; \
column_sum += g10_im*g10_im; \
sincos(g21_im, &g20_im, &g20_re); \
double U20_mag = sqrt(u02_inv - column_sum); \
double U20_mag = sqrt(((u02_inv - column_sum) > 0 ? (u02_inv-column_sum) : 0)); \
g20_re *= U20_mag; \
g20_im *= U20_mag; \
double r_inv2 = 1.0 / (u0*row_sum); \
......@@ -148,14 +148,14 @@
float u0 = (dir < 6 ? anisotropy_f : (ga_idx >= (X4-1)*X1h*X2*X3 ? t_boundary_f : 1)); \
float u02_inv = __fdividef(1.f, u0*u0); \
float column_sum = u02_inv - row_sum; \
float U00_mag = sqrtf(column_sum); \
float U00_mag = sqrtf((column_sum > 0?column_sum:0)); \
__sincosf(g21_re, &g00_im, &g00_re); \
g00_re *= U00_mag; \
g00_im *= U00_mag; \
column_sum += g10_re*g10_re; \
column_sum += g10_im*g10_im; \
__sincosf(g21_im, &g20_im, &g20_re); \
float U20_mag = sqrtf(u02_inv - column_sum); \
float U20_mag = sqrtf(((u02_inv - column_sum)>0 ? (u02_inv - column_sum) : 0)); \
g20_re *= U20_mag; \
g20_im *= U20_mag; \
float r_inv2 = __fdividef(1.0f, u0*row_sum); \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment