Advanced Computing Platform for Theoretical Physics

Commit 1e57e6e1 authored by mikeaclark
Browse files

Cleaned up gauge_quda.cpp

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@406 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 72aac654
......@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 2;
int test_type = 1;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......
......@@ -12,363 +12,121 @@
#define SCALE_FLOAT (SHORT_LENGTH-1) / 2.f
#define SHIFT_FLOAT -1.f / (SHORT_LENGTH-1)
inline short floatToShort(float a) {
template <typename Float>
inline short FloatToShort(Float a) {
return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
}
// Quantize a double (expected to lie in [-1, 1]) to a 16-bit short.
// NOTE(review): SHIFT_FLOAT and SCALE_FLOAT are textual macros defined
// above; the expression depends on their exact expansion and operator
// precedence, so do not re-parenthesize casually.  The cast truncates
// toward zero rather than rounding -- presumably intentional, matching
// the float variant; confirm before changing.
inline short doubleToShort(double a) {
return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
}
// Pack the 8-real representation of one double-precision link into four
// double2 slots of res, strided by Nh: first the two phases
// atan2(g[1],g[0]) and atan2(g[13],g[12]), then components g[2..7]
// copied pairwise.
inline void packD8D(double2 *res, double *g) {
  res[0].x = atan2(g[1], g[0]);
  res[0].y = atan2(g[13], g[12]);
  // Remaining six reals go pairwise into the Nh-strided slots 1..3.
  for (int j = 1; j < 4; j++) {
    res[j*Nh].x = g[2*j];
    res[j*Nh].y = g[2*j+1];
  }
}
// Pack the 8-real representation of a double-precision link into two
// float4 slots (the second Nh elements away), narrowing to single
// precision: two phases plus components g[2..7].
inline void packS8D(float4 *res, double *g) {
  float4 *lo = res;
  float4 *hi = res + Nh;
  lo->x = (float)atan2(g[1], g[0]);
  lo->y = (float)atan2(g[13], g[12]);
  lo->z = (float)g[2];
  lo->w = (float)g[3];
  hi->x = (float)g[4];
  hi->y = (float)g[5];
  hi->z = (float)g[6];
  hi->w = (float)g[7];
}
// Pack the 8-real representation of a single-precision link into two
// float4 slots (the second Nh elements away): two phases plus
// components g[2..7].
inline void packS8S(float4 *res, float *g) {
  float4 v;
  v.x = atan2(g[1], g[0]);
  v.y = atan2(g[13], g[12]);
  v.z = g[2];
  v.w = g[3];
  res[0] = v;
  v.x = g[4];
  v.y = g[5];
  v.z = g[6];
  v.w = g[7];
  res[Nh] = v;
}
// Quantize the 8-real representation of a double-precision link into two
// short4 slots (the second Nh elements away).  Phases are divided by pi
// so the quantizer input stays within [-1, 1]; the remaining components
// are assumed already bounded -- see doubleToShort.
inline void packH8D(short4 *res, double *g) {
  short4 *first = res;
  short4 *second = res + Nh;
  first->x = doubleToShort(atan2(g[1], g[0])/ M_PI);
  first->y = doubleToShort(atan2(g[13], g[12])/ M_PI);
  first->z = doubleToShort(g[2]);
  first->w = doubleToShort(g[3]);
  second->x = doubleToShort(g[4]);
  second->y = doubleToShort(g[5]);
  second->z = doubleToShort(g[6]);
  second->w = doubleToShort(g[7]);
}
// Quantize the 8-real representation of a single-precision link into two
// short4 slots (the second Nh elements away).  Phases are divided by pi
// so the quantizer input stays within [-1, 1].
inline void packH8S(short4 *res, float *g) {
  short4 *first = res;
  short4 *second = res + Nh;
  first->x = floatToShort(atan2(g[1], g[0])/ M_PI);
  first->y = floatToShort(atan2(g[13], g[12])/ M_PI);
  first->z = floatToShort(g[2]);
  first->w = floatToShort(g[3]);
  second->x = floatToShort(g[4]);
  second->y = floatToShort(g[5]);
  second->z = floatToShort(g[6]);
  second->w = floatToShort(g[7]);
}
inline void packD12D(double2 *res, double *g) {
template <typename Float>
inline void pack8(double2 *res, Float *g, int dir) {
double2 *r = res + dir*2*Nh;
r[0].x = atan2(g[1], g[0]);
r[0].y = atan2(g[13], g[12]);
r[Nh].x = g[2];
r[Nh].y = g[3];
r[2*Nh].x = g[4];
r[2*Nh].y = g[5];
r[3*Nh].x = g[6];
r[3*Nh].y = g[7];
}
template <typename Float>
inline void pack8(float4 *res, Float *g, int dir) {
float4 *r = res + dir*4*Nh;
r[0].x = atan2(g[1], g[0]);
r[0].y = atan2(g[13], g[12]);
r[0].z = g[2];
r[0].w = g[3];
r[Nh].x = g[4];
r[Nh].y = g[5];
r[Nh].z = g[6];
r[Nh].w = g[7];
}
template <typename Float>
inline void pack8(short4 *res, Float *g, int dir) {
short4 *r = res + dir*4*Nh;
r[0].x = FloatToShort(atan2(g[1], g[0])/ M_PI);
r[0].y = FloatToShort(atan2(g[13], g[12])/ M_PI);
r[0].z = FloatToShort(g[2]);
r[0].w = FloatToShort(g[3]);
r[Nh].x = FloatToShort(g[4]);
r[Nh].y = FloatToShort(g[5]);
r[Nh].z = FloatToShort(g[6]);
r[Nh].w = FloatToShort(g[7]);
}
template <typename Float>
inline void pack12(double2 *res, Float *g, int dir) {
double2 *r = res + dir*6*Nh;
for (int j=0; j<6; j++) {
res[j*Nh].x = g[j*2+0];
res[j*Nh].y = g[j*2+1];
r[j*Nh].x = g[j*2+0];
r[j*Nh].y = g[j*2+1];
}
}
inline void packS12D(float4 *res, double *g) {
template <typename Float>
inline void pack12(float4 *res, Float *g, int dir) {
float4 *r = res + dir*3*Nh;
for (int j=0; j<3; j++) {
res[j*Nh].x = (float)g[j*4+0];
res[j*Nh].y = (float)g[j*4+1];
res[j*Nh].z = (float)g[j*4+2];
res[j*Nh].w = (float)g[j*4+3];
r[j*Nh].x = (float)g[j*4+0];
r[j*Nh].y = (float)g[j*4+1];
r[j*Nh].z = (float)g[j*4+2];
r[j*Nh].w = (float)g[j*4+3];
}
}
// sse packing routine
// Copy one 12-real gauge matrix block from g into three float4 slots of
// res, strided by Nh.  The equivalent scalar code is preserved in the
// comment at the bottom.
// NOTE(review): the stores use the *aligned* _mm_store_ps, so res,
// res+Nh and res+2*Nh must all be 16-byte aligned (a float4 array
// normally is); the source g is read with the unaligned load and may
// sit at any address.  Confirm alignment holds for every caller.
inline void packS12S(float4 *res, float *g) {
__m128 a, b, c;
a = _mm_loadu_ps((const float*)g);
b = _mm_loadu_ps((const float*)(g+4));
c = _mm_loadu_ps((const float*)(g+8));
_mm_store_ps((float*)(res), a);
_mm_store_ps((float*)(res+Nh), b);
_mm_store_ps((float*)(res+2*Nh), c);
/* for (int j=0; j<3; j++) {
res[j*Nh].x = g[j*4+0];
res[j*Nh].y = g[j*4+1];
res[j*Nh].z = g[j*4+2];
res[j*Nh].w = g[j*4+3];
}*/
}
inline void packH12D(short4 *res, double *g) {
template <typename Float>
inline void pack12(short4 *res, Float *g, int dir) {
short4 *r = res + dir*3*Nh;
for (int j=0; j<3; j++) {
res[j*Nh].x = doubleToShort(g[j*4+0]);
res[j*Nh].y = doubleToShort(g[j*4+1]);
res[j*Nh].z = doubleToShort(g[j*4+2]);
res[j*Nh].w = doubleToShort(g[j*4+3]);
}
}
inline void packH12S(short4 *res, float *g) {
for (int j=0; j<3; j++) {
res[j*Nh].x = floatToShort(g[j*4+0]);
res[j*Nh].y = floatToShort(g[j*4+1]);
res[j*Nh].z = floatToShort(g[j*4+2]);
res[j*Nh].w = floatToShort(g[j*4+3]);
r[j*Nh].x = FloatToShort(g[j*4+0]);
r[j*Nh].y = FloatToShort(g[j*4+1]);
r[j*Nh].z = FloatToShort(g[j*4+2]);
r[j*Nh].w = FloatToShort(g[j*4+3]);
}
}
// Assume the gauge field is "QDP" ordered directions inside of
// space-time column-row ordering even-odd space-time
// Pack a QDP-ordered double-precision gauge field (one array per
// direction) into the device double2 layout for one parity.
// oddBit selects the even (0) or odd (1) sublattice; reconstruct picks
// the 12- or 8-real per-link packing.
void packQDPGaugeFieldDD(double2 *res, double **gauge, int oddBit, ReconstructType reconstruct) {
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  // double2 slots consumed per direction: 6 for 12-real, 4 for 8-real.
  const int stride = twelve ? 6*Nh : 4*Nh;
  for (int dir = 0; dir < 4; dir++) {
    double *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
    double2 *r = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      if (twelve) packD12D(r+i, g+i*18);
      else packD8D(r+i, g+i*18);
    }
  }
}
// Pack a QDP-ordered double-precision gauge field into the device
// single-precision float4 layout for one parity.  oddBit selects the
// even (0) or odd (1) sublattice; reconstruct picks 12- or 8-real
// per-link packing.
void packQDPGaugeFieldSD(float4 *res, double **gauge, int oddBit, ReconstructType reconstruct) {
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  // float4 slots consumed per direction: 3 for 12-real, 2 for 8-real.
  const int stride = twelve ? 3*Nh : 2*Nh;
  for (int dir = 0; dir < 4; dir++) {
    double *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
    float4 *r = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      if (twelve) packS12D(r+i, g+i*18);
      else packS8D(r+i, g+i*18);
    }
  }
}
// Pack a QDP-ordered double-precision gauge field into the device
// half-precision short4 layout for one parity.  oddBit selects the even
// (0) or odd (1) sublattice; reconstruct picks 12- or 8-real per-link
// packing.
void packQDPGaugeFieldHD(short4 *res, double **gauge, int oddBit, ReconstructType reconstruct) {
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  // short4 slots consumed per direction: 3 for 12-real, 2 for 8-real.
  const int stride = twelve ? 3*Nh : 2*Nh;
  for (int dir = 0; dir < 4; dir++) {
    double *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
    short4 *r = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      if (twelve) packH12D(r+i, g+i*18);
      else packH8D(r+i, g+i*18);
    }
  }
}
// Single precision version of the above
// Pack a QDP-ordered single-precision gauge field into the device float4
// layout for one parity.  oddBit selects the even (0) or odd (1)
// sublattice; reconstruct picks 12- or 8-real per-link packing.
void packQDPGaugeFieldSS(float4 *res, float **gauge, int oddBit, ReconstructType reconstruct) {
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  // float4 slots consumed per direction: 3 for 12-real, 2 for 8-real.
  const int stride = twelve ? 3*Nh : 2*Nh;
  for (int dir = 0; dir < 4; dir++) {
    float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
    float4 *r = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      if (twelve) packS12S(r+i, g+i*18);
      else packS8S(r+i, g+i*18);
    }
  }
}
// Cuda half precision version of the above
void packQDPGaugeFieldHS(short4 *res, float **gauge, int oddBit, ReconstructType reconstruct) {
template <typename Float, typename FloatN>
void packQDPGaugeField(FloatN *res, Float **gauge, int oddBit, ReconstructType reconstruct) {
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
short4 *r = res + dir*3*Nh;
for (int i = 0; i < Nh; i++) {
packH12S(r+i, g+i*18);
}
Float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
for (int i = 0; i < Nh; i++) pack12(res+i, g+i*18, dir);
}
} else {
for (int dir = 0; dir < 4; dir++) {
float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
short4 *r = res + dir*2*Nh;
for (int i = 0; i < Nh; i++) {
packH8S(r+i, g+i*18);
}
Float *g = gauge[dir] + oddBit*Nh*gaugeSiteSize;
for (int i = 0; i < Nh; i++) pack8(res+i, g+i*18, dir);
}
}
}
// Assume the gauge field is "Wilson" ordered directions inside of
// space-time column-row ordering even-odd space-time
void packCPSGaugeFieldDD(double2 *res, double *gauge, int oddBit, ReconstructType reconstruct) {
double gT[18];
template <typename Float, typename FloatN>
void packCPSGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType reconstruct) {
Float gT[18];
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
double *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
double2 *r = res + dir*6*Nh;
Float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
for (int i = 0; i < Nh; i++) {
// Must reorder rows-columns
for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
packD12D(r+i, gT);
pack12(res+i, gT, dir);
}
}
} else {
for (int dir = 0; dir < 4; dir++) {
double *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
double2 *r = res + dir*4*Nh;
Float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
for (int i = 0; i < Nh; i++) {
// Must reorder rows-columns
for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
packD8D(r+i, gT);
}
}
}
}
// Pack a CPS "Wilson"-ordered double-precision gauge field (directions
// run fastest inside each site) into the device single-precision float4
// layout for one parity.  Each 3x3 matrix is row/column transposed into
// gT (and narrowed to float) before the per-link packer runs; for
// 12-real reconstruction only the first two rows are kept.
void packCPSGaugeFieldSD(float4 *res, double *gauge, int oddBit, ReconstructType reconstruct) {
  float gT[18];
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  const int nRows = twelve ? 2 : 3;           // rows retained per link
  const int stride = twelve ? 3*Nh : 2*Nh;    // float4 slots per direction
  for (int dir = 0; dir < 4; dir++) {
    double *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
    float4 *out = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      // Transpose rows and columns, narrowing to single precision.
      for (int ic = 0; ic < nRows; ic++)
        for (int jc = 0; jc < 3; jc++)
          for (int reim = 0; reim < 2; reim++)
            gT[(ic*3+jc)*2+reim] = (float)g[4*i*18 + (jc*3+ic)*2+reim];
      if (twelve) packS12S(out+i, gT);
      else packS8S(out+i, gT);
    }
  }
}
// Pack a CPS "Wilson"-ordered double-precision gauge field into the
// device half-precision short4 layout for one parity.  Each 3x3 matrix
// is row/column transposed into gT (narrowed to float) before the
// quantizing packer runs; for 12-real reconstruction only the first two
// rows are kept.
void packCPSGaugeFieldHD(short4 *res, double *gauge, int oddBit, ReconstructType reconstruct) {
  float gT[18];
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  const int nRows = twelve ? 2 : 3;           // rows retained per link
  const int stride = twelve ? 3*Nh : 2*Nh;    // short4 slots per direction
  for (int dir = 0; dir < 4; dir++) {
    double *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
    short4 *out = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      // Transpose rows and columns, narrowing to single precision.
      for (int ic = 0; ic < nRows; ic++)
        for (int jc = 0; jc < 3; jc++)
          for (int reim = 0; reim < 2; reim++)
            gT[(ic*3+jc)*2+reim] = (float)g[4*i*18 + (jc*3+ic)*2+reim];
      if (twelve) packH12S(out+i, gT);
      else packH8S(out+i, gT);
    }
  }
}
// Single precision version of the above
// Pack a CPS "Wilson"-ordered single-precision gauge field into the
// device float4 layout for one parity.  Each 3x3 matrix is row/column
// transposed into gT before the per-link packer runs; for 12-real
// reconstruction only the first two rows are kept.
void packCPSGaugeFieldSS(float4 *res, float *gauge, int oddBit, ReconstructType reconstruct) {
  float gT[18];
  const int twelve = (reconstruct == QUDA_RECONSTRUCT_12);
  const int nRows = twelve ? 2 : 3;           // rows retained per link
  const int stride = twelve ? 3*Nh : 2*Nh;    // float4 slots per direction
  for (int dir = 0; dir < 4; dir++) {
    float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
    float4 *out = res + dir*stride;
    for (int i = 0; i < Nh; i++) {
      // Transpose rows and columns.
      for (int ic = 0; ic < nRows; ic++)
        for (int jc = 0; jc < 3; jc++)
          for (int reim = 0; reim < 2; reim++)
            gT[(ic*3+jc)*2+reim] = g[4*i*18 + (jc*3+ic)*2+reim];
      if (twelve) packS12S(out+i, gT);
      else packS8S(out+i, gT);
    }
  }
}
// Cuda half precision, single precision CPS field
void packCPSGaugeFieldHS(short4 *res, float *gauge, int oddBit, ReconstructType reconstruct) {
if (reconstruct == QUDA_RECONSTRUCT_12) {
for (int dir = 0; dir < 4; dir++) {
float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
float gT[12];
short4 *r = res + dir*3*Nh;
for (int i = 0; i < Nh; i++) {
// Must reorder rows-columns
for (int ic=0; ic<2; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
packH12S(r+i, gT);
}
}
} else {
for (int dir = 0; dir < 4; dir++) {
float *g = gauge + (oddBit*Nh*4+dir)*gaugeSiteSize;
float gT[18];
short4 *r = res + dir*2*Nh;
for (int i = 0; i < Nh; i++) {
// Must reorder rows-columns
for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
gT[(ic*3+jc)*2+r] = g[4*i*18 + (jc*3+ic)*2+r];
packH8S(r+i, gT);
pack8(res+i, gT, dir);
}
}
}
......@@ -411,134 +169,43 @@ void freeGaugeField(FullGauge *cudaGauge) {
cudaGauge->odd = NULL;
}
void loadGaugeField(FullGauge *cudaGauge, void *cpuGauge) {
template <typename Float, typename FloatN>
void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, ReconstructType reconstruct, int packedGaugeBytes) {
if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) {
if (gauge_param->cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
exit(-1);
}
// Use pinned memory
double2 *packedEven, *packedOdd;
// Use pinned memory
FloatN *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
cudaMallocHost((void**)&packedEven, cudaGauge->packedGaugeBytes);
cudaMallocHost((void**)&packedOdd, cudaGauge->packedGaugeBytes);
cudaMallocHost((void**)&packedEven, packedGaugeBytes);
cudaMallocHost((void**)&packedOdd, packedGaugeBytes);
#else
packedEven = (double2*)malloc(cudaGauge->packedGaugeBytes);
packedOdd = (double2*)malloc(cudaGauge->packedGaugeBytes);
packedEven = (FloatN*)malloc(packedGaugeBytes);
packedOdd = (FloatN*)malloc(packedGaugeBytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
packQDPGaugeFieldDD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldDD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeFieldDD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldDD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(cudaGauge->even, packedEven, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge->odd, packedOdd, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
#else
free(packedEven);
free(packedOdd);
#endif
} else if (cudaGauge->precision == QUDA_SINGLE_PRECISION) {
// Use pinned memory
float4 *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
cudaMallocHost((void**)&packedEven, cudaGauge->packedGaugeBytes);
cudaMallocHost((void**)&packedOdd, cudaGauge->packedGaugeBytes);
#else
packedEven = (float4*)malloc(cudaGauge->packedGaugeBytes);
packedOdd = (float4*)malloc(cudaGauge->packedGaugeBytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPGaugeFieldSD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldSD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packQDPGaugeFieldSS(packedEven, (float**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldSS(packedOdd, (float**)cpuGauge, 1, cudaGauge->reconstruct);
}
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSGaugeFieldSD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldSD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packCPSGaugeFieldSS(packedEven, (float*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldSS(packedOdd, (float*)cpuGauge, 1, cudaGauge->reconstruct);
}
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(cudaGauge->even, packedEven, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge->odd, packedOdd, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
#else
free(packedEven);
free(packedOdd);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct);
packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct);
} else {
// Use pinned memory
short4 *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
cudaMallocHost((void**)&packedEven, cudaGauge->packedGaugeBytes);
cudaMallocHost((void**)&packedOdd, cudaGauge->packedGaugeBytes);
#else
packedEven = (short4*)malloc(cudaGauge->packedGaugeBytes);
packedOdd = (short4*)malloc(cudaGauge->packedGaugeBytes);
#endif
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(even, packedEven, packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(odd, packedOdd, packedGaugeBytes, cudaMemcpyHostToDevice);
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPGaugeFieldHD(packedEven, (double**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldHD(packedOdd, (double**)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packQDPGaugeFieldHS(packedEven, (float**)cpuGauge, 0, cudaGauge->reconstruct);
packQDPGaugeFieldHS(packedOdd, (float**)cpuGauge, 1, cudaGauge->reconstruct);
}
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSGaugeFieldHD(packedEven, (double*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldHD(packedOdd, (double*)cpuGauge, 1, cudaGauge->reconstruct);
} else {
packCPSGaugeFieldHS(packedEven, (float*)cpuGauge, 0, cudaGauge->reconstruct);
packCPSGaugeFieldHS(packedOdd, (float*)cpuGauge, 1, cudaGauge->reconstruct);
}
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(cudaGauge->even, packedEven, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge->odd, packedOdd, cudaGauge->packedGaugeBytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
#else
free(packedEven);
free(packedOdd);
free(packedEven);
free(packedOdd);
#endif
}
}
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reconstruct, Precision precision) {
......@@ -553,7 +220,29 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, ReconstructType reco
exit(-1);
}
if (precision == QUDA_DOUBLE_PRECISION && gauge_param->cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
exit(-1);
}
allocateGaugeField(cudaGauge, reconstruct, precision);
loadGaugeField(cudaGauge, cpuGauge);
if (precision == QUDA_DOUBLE_PRECISION) {
loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
} else if (precision == QUDA_SINGLE_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
} else if (precision == QUDA_HALF_PRECISION) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (double*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (float*)cpuGauge,
cudaGauge->reconstruct, cudaGauge->packedGaugeBytes);
}
}
......@@ -104,28 +104,6 @@ void freeSpinorBuffer() {
packedSpinor1 = NULL;
}
// Copy two consecutive doubles from b into the (x, y) fields of *a.
inline void packDouble2(double2* a, double *b) {
  double2 v;
  v.x = b[0];
  v.y = b[1];
  *a = v;
}
inline void unpackDouble2(double *a, double2 *b) {
a[0] = b->x; a[1] = b->y;
}
// Copy four consecutive floats from b into the float4 at a, using SSE.
// Both the load and the store are the unaligned variants, so neither
// pointer needs 16-byte alignment.  The equivalent scalar code is kept
// in the comment below.
inline void packFloat4(float4* a, float *b) {
__m128 SSEtmp;
SSEtmp = _mm_loadu_ps((const float*)b);
_mm_storeu_ps((float*)a, SSEtmp);
//a->x = b[0]; a->y = b[1]; a->z = b[2]; a->w = b[3];
}
// Copy the float4 at b out into four consecutive floats at a, using SSE.
// NOTE(review): the load here is the *aligned* _mm_load_ps, so b must be
// 16-byte aligned (normally true for float4 storage) -- confirm callers
// never pass an offset pointer; the store to a is unaligned.  The
// equivalent scalar code is kept in the comment below.
inline void unpackFloat4(float *a, float4 *b) {
__m128 SSEtmp;
SSEtmp = _mm_load_ps((const float*)b);
_mm_storeu_ps((float*)a, SSEtmp);
//a[0] = b->x; a[1] = b->y; a[2] = b->z; a[3] = b->w;
}
template <typename Float>
inline void packSpinorVector(float4* a, Float *b) {
Float K = 1.0 / 2.0;
......@@ -134,11 +112,11 @@ inline void packSpinorVector(float4* a, Float *b) {
a[0*Nh].y = K*(b[1*6+0*2+1]+b[3*6+0*2+1]);
a[0*Nh].z = K*(b[1*6+1*2+0]+b[3*6+1*2+0]);
a[0*Nh].w = K*(b[1*6+1*2+1]+b[3*6+1*2+1]);
a[1*Nh].x = K*(b[1*6+2*2+0]+b[3*6+2*2+0]);
a[1*Nh].y = K*(b[1*6+2*2+1]+b[3*6+2*2+1]);
a[1*Nh].z = -K*(b[0*6+0*2+0]+b[2*6+0*2+0]);
a[1*Nh].w = -K*(b[0*6+0*2+1]+b[2*6+0*2+1]);
a[1*Nh].z = -K*(b[2*6+0*2+0]+b[0*6+0*2+0]);
a[1*Nh].w = -K*(b[2*6+0*2+1]+b[0*6+0*2+1]);
a[2*Nh].x = -K*(b[0*6+1*2+0]+b[2*6+1*2+0]);
a[2*Nh].y = -K*(b[0*6+1*2+1]+b[2*6+1*2+1]);
......@@ -159,7 +137,6 @@ inline void packSpinorVector(float4* a, Float *b) {
a[5*Nh].y = K*(b[2*6+1*2+1]-b[0*6+1*2+1]);
a[5*Nh].z = K*(b[2*6+2*2+0]-b[0*6+2*2+0]);
a[5*Nh].w = K*(b[2*6+2*2+1]-b[0*6+2*2+1]);
}
template <typename Float>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment