#include #include #include #include #include #include ParityClover allocateParityClover(int *X, Precision precision) { ParityClover ret; ret.precision = precision; ret.volume = 1; for (int d=0; d<4; d++) { ret.X[d] = X[d]; ret.volume *= X[d]; } ret.Nc = 3; ret.Ns = 4; ret.length = ret.volume*ret.Nc*ret.Nc*ret.Ns*ret.Ns/2; // block-diagonal Hermitian (72 reals) if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double); else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float); else ret.bytes = ret.length*sizeof(short); if (cudaMalloc((void**)&(ret.clover), ret.bytes) == cudaErrorMemoryAllocation) { printf("Error allocating clover term\n"); exit(0); } if (precision == QUDA_HALF_PRECISION) { if (cudaMalloc((void**)&ret.cloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) { printf("Error allocating cloverNorm\n"); exit(0); } } return ret; } FullClover allocateCloverField(int *X, Precision precision) { FullClover ret; ret.even = allocateParityClover(X, precision); ret.odd = allocateParityClover(X, precision); return ret; } void freeParityClover(ParityClover *clover) { cudaFree(clover->clover); clover->clover = NULL; } void freeCloverField(FullClover *clover) { freeParityClover(&clover->even); freeParityClover(&clover->odd); } template static inline void packCloverMatrix(float4* a, Float *b, int Vh) { const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change for (int i=0; i<18; i++) { a[i*Vh].x = half * b[4*i+0]; a[i*Vh].y = half * b[4*i+1]; a[i*Vh].z = half * b[4*i+2]; a[i*Vh].w = half * b[4*i+3]; } } template static inline void packCloverMatrix(double2* a, Float *b, int Vh) { const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change for (int i=0; i<36; i++) { a[i*Vh].x = half * b[2*i+0]; a[i*Vh].y = half * b[2*i+1]; } } template static void packParityClover(FloatN *res, Float *clover, int Vh) { for (int i = 0; i < Vh; i++) { packCloverMatrix(res+i, clover+72*i, Vh); } } template static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X) { int Vh = X[0]*X[1]*X[2]*X[3]; X[0] *= 2; // X now contains dimensions of the full lattice for (int i=0; i static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh) { const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change Float max, a, c; // treat the two chiral blocks separately for (int chi=0; chi<2; chi++) { max = fabs(clover[0]); for (int i=1; i<36; i++) { if ((a = fabs(clover[i])) > max) max = a; } c = MAX_SHORT/max; for (int i=0; i<9; i++) { res[i*Vh].x = (short) (c * clover[4*i+0]); res[i*Vh].y = (short) (c * clover[4*i+1]); res[i*Vh].z = (short) (c * clover[4*i+2]); res[i*Vh].w = (short) (c * clover[4*i+3]); } norm[chi*Vh] = half*max; res += 9*Vh; clover += 36; } } template static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh) { for (int i = 0; i < Vh; i++) { packCloverMatrixHalf(res+i, norm+i, clover+72*i, Vh); } } template static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm, Float *clover, int *X) { int Vh = X[0]*X[1]*X[2]*X[3]; X[0] *= 2; // X now contains dimensions of the full lattice for (int i=0; iclover_cpu_prec == QUDA_HALF_PRECISION) { printf("QUDA error: half precision not supported on cpu\n"); exit(-1); } // X should contain the dimensions of the even/odd sublattice *cudaClover = allocateCloverField(X, precision); loadCloverField(*cudaClover, cpuClover, precision, invert_param->clover_order); } */