Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 937f5509 authored by mikeaclark
Browse files

Fixed pack test

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@403 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 86503876
......@@ -8,7 +8,7 @@
#include <gauge_quda.h>
// What test are we doing (0 = dslash, 1 = MatPC, 2 = Mat)
int test_type = 0;
int test_type = 1;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
......@@ -30,7 +30,7 @@ int TRANSFER = 0; // include transfer time in the benchmark?
void init() {
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
gaugeParam.cuda_prec = QUDA_DOUBLE_PRECISION;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct;
gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec;
......@@ -45,7 +45,7 @@ void init() {
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
if (test_type == 2) inv_param.dirac_order = QUDA_DIRAC_ORDER;
else inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
......
......@@ -20,7 +20,7 @@ int main(int argc, char **argv)
Gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
Gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
Gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -36,15 +36,15 @@ int main(int argc, char **argv)
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
double mass = -0.96;
double mass = -0.97;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-7;
inv_param.maxiter = 5000;
inv_param.reliable_delta = 1e-2;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
......
......@@ -110,28 +110,24 @@ void packTest() {
printf("QDP Gauge send time = %e seconds\n", qdpGtime);
stopwatchStart();
loadSpinorField(cudaFullSpinor, (void*)spinor, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
loadSpinorField(cudaFullSpinor, (void*)spinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double sSendTime = stopwatchReadSeconds();
printf("Spinor send time = %e seconds\n", sSendTime);
stopwatchStart();
stopwatchStart();
loadParitySpinor(cudaFullSpinor.even, (void*)spinor, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
loadParitySpinor(cudaFullSpinor.even, (void*)spinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double pSendTime = stopwatchReadSeconds();
printf("Parity spinor send time = %e seconds\n", pSendTime);
stopwatchStart();
retrieveSpinorField(spinor, cudaFullSpinor, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
retrieveSpinorField(spinor, cudaFullSpinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double sRecTime = stopwatchReadSeconds();
printf("Spinor receive time = %e seconds\n", sRecTime);
stopwatchStart();
retrieveParitySpinor(spinor, cudaParitySpinor, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
retrieveParitySpinor(spinor, cudaParitySpinor, QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
double pRecTime = stopwatchReadSeconds();
printf("Parity receive time = %e seconds\n", pRecTime);
......
......@@ -146,7 +146,6 @@ void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j = 0; j < 12; j++) packDouble2(even+j*Nh+i, b+j*2);
}
......@@ -162,58 +161,15 @@ void packFullSpinorDD(double2 *even, double2 *odd, double *spinor) {
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j=0; j<12; j++) packDouble2(odd+j*Nh+i, b+j*2);
}
}
}
// Packs a double-precision host spinor into single-precision float4 buffers,
// one for even sites and one for odd sites.
//
//   even, odd : destination buffers, written through packFloat4 at
//               index chunk*Nh + site for chunk in [0, 6).
//   spinor    : source field, read as 24 doubles per site.
//
// For each of the Nh half-lattice sites the 24 source values are combined
// into four groups of six (sums/differences scaled by 1/2) before being
// narrowed to float. NOTE(review): this looks like a spin-basis change, but
// that is inferred from the arithmetic only -- confirm against the kernels.
void packFullSpinorSD(float4 *even, float4 *odd, double *spinor) {
  const double half = 1.0 / 2.0;
  float packed[24];

  for (int site = 0; site < Nh; site++) {
    int crossings = site/L1h + site/(L2*L1h) + site/(L3*L2*L1h);

    // parity 0 fills `even`, parity 1 fills `odd` (even is handled first).
    for (int parity = 0; parity < 2; parity++) {
      float4 *dst = (parity == 0) ? even : odd;
      int full_index = 2*site + (crossings + parity)%2;
      double *src = spinor + full_index*24;

      // cr enumerates the six (c, r) pairs in the order c*2 + r.
      for (int cr = 0; cr < 6; cr++) {
        packed[0*6+cr] =  half*(src[1*6+cr] + src[3*6+cr]);
        packed[1*6+cr] = -half*(src[0*6+cr] + src[2*6+cr]);
        packed[2*6+cr] =  half*(src[1*6+cr] - src[3*6+cr]);
        packed[3*6+cr] =  half*(src[2*6+cr] - src[0*6+cr]);
      }

      // Six groups of four floats; consecutive groups are Nh elements apart.
      for (int chunk = 0; chunk < 6; chunk++) {
        packFloat4(dst + chunk*Nh + site, packed + chunk*4);
      }
    }
  }
}
void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
float K = 1.0 / 2.0;
template <typename Float>
void packFullSpinorSF(float4 *even, float4 *odd, Float *spinor) {
Float K = 1.0 / 2.0;
float b[24];
for (int i=0; i<Nh; i++) {
......@@ -222,7 +178,7 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
{ // even sites
int k = 2*i + boundaryCrossings%2;
float *a = spinor + k*24;
Float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
......@@ -232,13 +188,12 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
for (int j=0; j<6; j++) packFloat4(even+j*Nh+i, b+j*4);
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
float *a = spinor + k*24;
Float *a = spinor + k*24;
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
......@@ -247,8 +202,7 @@ void packFullSpinorSS(float4 *even, float4 *odd, float *spinor) {
b[2*6+cr] = K*(a[1*6+cr]-a[3*6+cr]);
b[3*6+cr] = K*(a[2*6+cr]-a[0*6+cr]);
}
}
}
for (int j=0; j<6; j++) packFloat4(odd+j*Nh+i, b+j*4);
}
}
......@@ -436,55 +390,9 @@ void unpackFullSpinorDD(double *res, double2 *even, double2 *odd) {
}
void unpackFullSpinorDS(double *res, float4 *even, float4 *odd) {
double K = 1.0;
float b[24];
for (int i=0; i<Nh; i++) {
int boundaryCrossings = i/L1h + i/(L2*L1h) + i/(L3*L2*L1h);
{ // even sites
int k = 2*i + boundaryCrossings%2;
double *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, even+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
double *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, odd+j*Nh+i);
for (int c=0; c<3; c++) {
for (int r=0; r<2; r++) {
int cr = c*2+r;
a[0*6+cr] = -K*(b[1*6+cr]+b[3*6+cr]);
a[1*6+cr] = K*(b[0*6+cr]+b[2*6+cr]);
a[2*6+cr] = -K*(b[1*6+cr]-b[3*6+cr]);
a[3*6+cr] = -K*(b[2*6+cr]-b[0*6+cr]);
}
}
}
}
}
void unpackFullSpinorSS(float *res, float4 *even, float4 *odd) {
float K = 1.0;
template <typename Float>
void unpackFullSpinorFS(Float *res, float4 *even, float4 *odd) {
Float K = 1.0;
float b[24];
for (int i=0; i<Nh; i++) {
......@@ -493,7 +401,7 @@ void unpackFullSpinorSS(float *res, float4 *even, float4 *odd) {
{ // even sites
int k = 2*i + boundaryCrossings%2;
float *a = res + k*24;
Float *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, even+j*Nh+i);
......@@ -511,7 +419,7 @@ void unpackFullSpinorSS(float *res, float4 *even, float4 *odd) {
{ // odd sites
int k = 2*i + (boundaryCrossings+1)%2;
float *a = res + k*24;
Float *a = res + k*24;
for (int j = 0; j < 6; j++) unpackFloat4(b+j*4, odd+j*Nh+i);
......@@ -715,8 +623,7 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
if (ret.even.precision != QUDA_HALF_PRECISION) {
size_t spinor_bytes;
if (ret.even.precision == QUDA_DOUBLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(double);
else if (ret.even.precision == QUDA_SINGLE_PRECISION) spinor_bytes = Nh*spinorSiteSize*sizeof(float);
else spinor_bytes = Nh*spinorSiteSize*sizeof(float)/2;
else spinor_bytes = Nh*spinorSiteSize*sizeof(float);
#ifndef __DEVICE_EMULATION__
if (!packedSpinor1) cudaMallocHost(&packedSpinor1, spinor_bytes);
......@@ -729,8 +636,9 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
packFullSpinorDD((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) packFullSpinorSD((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else packFullSpinorSS((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
if (cpu_prec == QUDA_DOUBLE_PRECISION)
packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (double*)spinor);
else packFullSpinorSF((float4*)packedSpinor1, (float4*)packedSpinor2, (float*)spinor);
}
cudaMemcpy(ret.even.spinor, packedSpinor1, spinor_bytes, cudaMemcpyHostToDevice);
......@@ -820,8 +728,9 @@ void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
if (spinor.even.precision == QUDA_DOUBLE_PRECISION) {
unpackFullSpinorDD((double*)res, (double2*)packedSpinor1, (double2*)packedSpinor2);
} else {
if (cpu_prec == QUDA_DOUBLE_PRECISION) unpackFullSpinorDS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorSS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
if (cpu_prec == QUDA_DOUBLE_PRECISION)
unpackFullSpinorFS((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
else unpackFullSpinorFS((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2);
}
#ifndef __DEVICE_EMULATION__
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment