#include #include #include #include // not needed once call to allocateParitySpinor() is removed #if (__CUDA_ARCH__ == 130) static __inline__ __device__ double2 fetch_double2(texture t, int i) { int4 v = tex1Dfetch(t,i); return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); } #endif // Double precision gauge field texture gauge0TexDouble; texture gauge1TexDouble; // Single precision gauge field texture gauge0TexSingle; texture gauge1TexSingle; // Half precision gauge field texture gauge0TexHalf; texture gauge1TexHalf; // Double precision input spinor field texture spinorTexDouble; // Single precision input spinor field texture spinorTexSingle; // Half precision input spinor field texture spinorTexHalf; texture spinorTexNorm; // Double precision accumulate spinor field texture accumTexDouble; // Single precision accumulate spinor field texture accumTexSingle; // Half precision accumulate spinor field texture accumTexHalf; texture accumTexNorm; // Double precision clover term texture cloverTexDouble; // Single precision clover term texture cloverTexSingle; // Half precision clover term texture cloverTexHalf; texture cloverTexNorm; QudaGaugeParam *gauge_param; QudaInvertParam *invert_param; __constant__ int X1h; __constant__ int X1; __constant__ int X2; __constant__ int X3; __constant__ int X4; __constant__ int X1m1; __constant__ int X2m1; __constant__ int X3m1; __constant__ int X4m1; __constant__ int X2X1mX1; __constant__ int X3X2X1mX2X1; __constant__ int X4X3X2X1mX3X2X1; __constant__ int X4X3X2X1hmX3X2X1h; __constant__ float X1h_inv; __constant__ float X2_inv; __constant__ float X3_inv; __constant__ int X2X1; __constant__ int X3X2X1; __constant__ int Vh; __constant__ int gauge_fixed; // single precision constants __constant__ float anisotropy_f; __constant__ float t_boundary_f; __constant__ float pi_f; // double precision constants __constant__ double anisotropy; __constant__ double t_boundary; static int initDslash = 0; unsigned long long dslash_quda_flops; unsigned long long dslash_quda_bytes; //#include // Dslash kernel definitions // kludge to avoid '#include nested too deeply' error #define DD_DAG 0 #include #undef DD_DAG #define DD_DAG 1 #include #undef DD_DAG #include // kernels for applying the clover term alone static void initDslashCuda(FullGauge gauge) { int Vh = gauge.volume; cudaMemcpyToSymbol("Vh", &Vh, sizeof(int)); if (gauge.blockDim%64 != 0) { printf("Sorry, block size not set approriately\n"); exit(-1); } if (Vh%gauge.blockDim !=0) { printf("Sorry, volume is not a multiple of number of threads %d\n", gauge.blockDim); exit(-1); } int X1 = 2*gauge.X[0]; cudaMemcpyToSymbol("X1", &X1, sizeof(int)); int X2 = gauge.X[1]; cudaMemcpyToSymbol("X2", &X2, sizeof(int)); int X3 = gauge.X[2]; cudaMemcpyToSymbol("X3", &X3, sizeof(int)); int X4 = gauge.X[3]; cudaMemcpyToSymbol("X4", &X4, sizeof(int)); int X2X1 = X2*X1; cudaMemcpyToSymbol("X2X1", &X2X1, sizeof(int)); int X3X2X1 = X3*X2*X1; cudaMemcpyToSymbol("X3X2X1", &X3X2X1, sizeof(int)); int X1h = X1/2; cudaMemcpyToSymbol("X1h", &X1h, sizeof(int)); float X1h_inv = 1.0 / X1h; cudaMemcpyToSymbol("X1h_inv", &X1h_inv, sizeof(float)); float X2_inv = 1.0 / X2; cudaMemcpyToSymbol("X2_inv", &X2_inv, sizeof(float)); float X3_inv = 1.0 / X3; cudaMemcpyToSymbol("X3_inv", &X3_inv, sizeof(float)); int X1m1 = X1 - 1; cudaMemcpyToSymbol("X1m1", &X1m1, sizeof(int)); int X2m1 = X2 - 1; cudaMemcpyToSymbol("X2m1", &X2m1, sizeof(int)); int X3m1 = X3 - 1; cudaMemcpyToSymbol("X3m1", &X3m1, sizeof(int)); int X4m1 = X4 - 1; cudaMemcpyToSymbol("X4m1", &X4m1, sizeof(int)); int X2X1mX1 = X2X1 - X1; cudaMemcpyToSymbol("X2X1mX1", &X2X1mX1, sizeof(int)); int X3X2X1mX2X1 = X3X2X1 - X2X1; cudaMemcpyToSymbol("X3X2X1mX2X1", &X3X2X1mX2X1, sizeof(int)); int X4X3X2X1mX3X2X1 = (X4-1)*X3X2X1; cudaMemcpyToSymbol("X4X3X2X1mX3X2X1", &X4X3X2X1mX3X2X1, sizeof(int)); int X4X3X2X1hmX3X2X1h = (X4-1)*X3*X2*X1h; cudaMemcpyToSymbol("X4X3X2X1hmX3X2X1h", &X4X3X2X1hmX3X2X1h, sizeof(int)); int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0; cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int)); cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(double)); double t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0; cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(double)); float anisotropy_f = gauge_param->anisotropy; cudaMemcpyToSymbol("anisotropy_f", &(anisotropy_f), sizeof(float)); float t_bc_f = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0; cudaMemcpyToSymbol("t_boundary_f", &(t_bc_f), sizeof(float)); float h_pi_f = M_PI; cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float)); initDslash = 1; } static void bindGaugeTex(FullGauge gauge, int oddBit) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (oddBit) { cudaBindTexture(0, gauge0TexDouble, gauge.odd, gauge.bytes); cudaBindTexture(0, gauge1TexDouble, gauge.even, gauge.bytes); } else { cudaBindTexture(0, gauge0TexDouble, gauge.even, gauge.bytes); cudaBindTexture(0, gauge1TexDouble, gauge.odd, gauge.bytes); } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (oddBit) { cudaBindTexture(0, gauge0TexSingle, gauge.odd, gauge.bytes); cudaBindTexture(0, gauge1TexSingle, gauge.even, gauge.bytes); } else { cudaBindTexture(0, gauge0TexSingle, gauge.even, gauge.bytes); cudaBindTexture(0, gauge1TexSingle, gauge.odd, gauge.bytes); } } else { if (oddBit) { cudaBindTexture(0, gauge0TexHalf, gauge.odd, gauge.bytes); cudaBindTexture(0, gauge1TexHalf, gauge.even, gauge.bytes); } else { cudaBindTexture(0, gauge0TexHalf, gauge.even, gauge.bytes); cudaBindTexture(0, gauge1TexHalf, gauge.odd, gauge.bytes); } } } static void bindCloverTex(ParityClover clover) { if (clover.precision == QUDA_DOUBLE_PRECISION) { cudaBindTexture(0, cloverTexDouble, clover.clover, clover.bytes); } else if (clover.precision == QUDA_SINGLE_PRECISION) { cudaBindTexture(0, cloverTexSingle, clover.clover, clover.bytes); } else { cudaBindTexture(0, cloverTexHalf, clover.clover, clover.bytes); cudaBindTexture(0, cloverTexNorm, clover.cloverNorm, clover.bytes/18); } } // ---------------------------------------------------------------------- static void checkSpinor(ParitySpinor out, ParitySpinor in) { if (in.precision != out.precision) { printf("Error in dslash quda: input and out spinor precisions don't match\n"); exit(-1); } #if (__CUDA_ARCH__ != 130) if (in.precision == QUDA_DOUBLE_PRECISION) { printf("Double precision not supported on this GPU\n"); exit(-1); } #endif } static void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) { if (spinor.volume != gauge.volume) { printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume); exit(-1); } #if (__CUDA_ARCH__ != 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { printf("Double precision not supported on this GPU\n"); exit(-1); } #endif } static void checkCloverSpinor(ParitySpinor spinor, FullClover clover) { if (spinor.volume != clover.even.volume) { printf("Error, spinor volume %d doesn't match even clover volume %d\n", spinor.volume, clover.even.volume); exit(-1); } if (spinor.volume != clover.odd.volume) { printf("Error, spinor volume %d doesn't match odd clover volume %d\n", spinor.volume, clover.odd.volume); exit(-1); } #if (__CUDA_ARCH__ != 130) if ((clover.even.precision == QUDA_DOUBLE_PRECISION) || (clover.odd.precision == QUDA_DOUBLE_PRECISION)) { printf("Double precision not supported on this GPU\n"); exit(-1); } #endif } int dslashCudaSharedBytes(Precision precision, int blockDim) { if (precision == QUDA_DOUBLE_PRECISION) return blockDim*SHARED_FLOATS_PER_THREAD*sizeof(double); else return blockDim*SHARED_FLOATS_PER_THREAD*sizeof(float); } // ---------------------------------------------------------------------- // plain Wilson Dslash: void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) { if (!initDslash) initDslashCuda(gauge); checkSpinor(in, out); checkGaugeSpinor(in, gauge); if (in.precision == QUDA_DOUBLE_PRECISION) { dslashDCuda(out, gauge, in, parity, dagger); } else if (in.precision == QUDA_SINGLE_PRECISION) { dslashSCuda(out, gauge, in, parity, dagger); } else if (in.precision == QUDA_HALF_PRECISION) { dslashHCuda(out, gauge, in, parity, dagger); } dslash_quda_flops += 1320*in.volume; } void dslashDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(double); cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(double); #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } #endif } void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(float); cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } } void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(float)/2; cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes); cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashDH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashSH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashHH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } } void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger, ParitySpinor x, double a) { if (!initDslash) initDslashCuda(gauge); checkSpinor(in, out); checkGaugeSpinor(in, gauge); if (in.precision == QUDA_DOUBLE_PRECISION) { dslashXpayDCuda(out, gauge, in, parity, dagger, x, a); } else if (in.precision == QUDA_SINGLE_PRECISION) { dslashXpaySCuda(out, gauge, in, parity, dagger, x, a); } else if (in.precision == QUDA_HALF_PRECISION) { dslashXpayHCuda(out, gauge, in, parity, dagger, x, a); } dslash_quda_flops += (1320+48)*in.volume; } void dslashXpayDCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(double); cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes); cudaBindTexture(0, accumTexDouble, x.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(double); #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } #endif } void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(float); cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes); cudaBindTexture(0, accumTexSingle, x.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } } void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); bindGaugeTex(gauge, oddBit); int spinor_bytes = res.length*sizeof(float)/2; cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes); cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12); cudaBindTexture(0, accumTexHalf, x.spinor, spinor_bytes); cudaBindTexture(0, accumTexNorm, x.spinorNorm, spinor_bytes/12); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } } // Apply the even-odd preconditioned Dirac operator void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger) { double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN) { dslashCuda(tmp, gauge, in, 1, dagger); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { dslashCuda(tmp, gauge, in, 0, dagger); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); } else { printf("QUDA error: matpc_type not valid for plain Wilson\n"); exit(-1); } } void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa, ParitySpinor tmp, MatPCType matpc_type) { MatPCCuda(out, gauge, in, kappa, tmp, matpc_type, 0); MatPCCuda(out, gauge, out, kappa, tmp, matpc_type, 1); } // Apply the full operator void MatCuda(FullSpinor out, FullGauge gauge, FullSpinor in, double kappa, int dagger) { dslashXpayCuda(out.odd, gauge, in.even, 1, dagger, in.odd, -kappa); dslashXpayCuda(out.even, gauge, in.odd, 0, dagger, in.even, -kappa); } // ---------------------------------------------------------------------- // clover-improved Wilson Dslash // // apply hopping term, then clover: (A_ee^-1 D_eo) or (A_oo^-1 D_oe), // and likewise for dagger: (A_ee^-1 D^dagger_eo) or (A_oo^-1 D^dagger_oe) void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, int parity, int dagger) { if (!initDslash) initDslashCuda(gauge); checkSpinor(in, out); checkGaugeSpinor(in, gauge); checkCloverSpinor(in, cloverInv); if (in.precision == QUDA_DOUBLE_PRECISION) { cloverDslashDCuda(out, gauge, cloverInv, in, parity, dagger); } else if (in.precision == QUDA_SINGLE_PRECISION) { cloverDslashSCuda(out, gauge, cloverInv, in, parity, dagger); } else if (in.precision == QUDA_HALF_PRECISION) { cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger); } dslash_quda_flops += (1320+504)*in.volume; } void cloverDslashDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(double); cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(double); #if (__CUDA_ARCH__ == 130) if (clover_prec == QUDA_DOUBLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDDD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSDD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDD12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDD12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHDD8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDD8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDS12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDS12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDDS8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDS8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDS12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDS12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSDS8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDS8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDS12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDS12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHDS8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDS8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDH12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDH12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDDH8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashDDH8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDH12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDH12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSDH8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashSDH8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDH12Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDH12DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHDH8Kernel <<>> ((double2 *)res.spinor, oddBit); } else { dslashHDH8DaggerKernel <<>> ((double2 *)res.spinor, oddBit); } } } } #endif } void cloverDslashSCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(float); cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSD12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSD12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDSD8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSD8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSD12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSD12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSSD8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSD8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSD12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSD12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHSD8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSD8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDSS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSSS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSS12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSS12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHSS8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSS8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSH12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSH12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashDSH8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashDSH8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSH12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSH12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashSSH8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashSSH8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSH12Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSH12DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } else { if (!daggerBit) { dslashHSH8Kernel <<>> ((float4 *)res.spinor, oddBit); } else { dslashHSH8DaggerKernel <<>> ((float4 *)res.spinor, oddBit); } } } } } void cloverDslashHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(float)/2; cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes); cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHD12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHD12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashDHD8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHD8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHD12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHD12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashSHD8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHD8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHD12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHD12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashHHD8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHD8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHS12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHS12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashDHS8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHS8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHS12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHS12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashSHS8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHS8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHS12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHS12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashHHS8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHS8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashDHH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashDHH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashSHH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashSHH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHH12Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHH12DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } else { if (!daggerBit) { dslashHHH8Kernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } else { dslashHHH8DaggerKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit); } } } } } void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv, ParitySpinor in, int parity, int dagger, ParitySpinor x, double a) { if (!initDslash) initDslashCuda(gauge); checkSpinor(in, out); checkGaugeSpinor(in, gauge); checkCloverSpinor(in, cloverInv); if (in.precision == QUDA_DOUBLE_PRECISION) { cloverDslashXpayDCuda(out, gauge, cloverInv, in, parity, dagger, x, a); } else if (in.precision == QUDA_SINGLE_PRECISION) { cloverDslashXpaySCuda(out, gauge, cloverInv, in, parity, dagger, x, a); } else if (in.precision == QUDA_HALF_PRECISION) { cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a); } dslash_quda_flops += (1320+504+48)*in.volume; } void cloverDslashXpayDCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(double); cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes); cudaBindTexture(0, accumTexDouble, x.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(double); #if (__CUDA_ARCH__ == 130) if (clover_prec == QUDA_DOUBLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDDD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSDD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDD12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDD12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHDD8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDD8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDS12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDS12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDDS8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDS8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDS12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDS12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSDS8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDS8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDS12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDS12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHDS8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDS8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDDH12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDH12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDDH8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashDDH8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSDH12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDH12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSDH8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashSDH8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHDH12XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDH12DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHDH8XpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } else { dslashHDH8DaggerXpayKernel <<>> ((double2 *)res.spinor, oddBit, a); } } } } #endif } void cloverDslashXpaySCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(float); cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes); cudaBindTexture(0, accumTexSingle, x.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSD12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSD12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDSD8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSD8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSD12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSD12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSSD8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSD8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSD12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSD12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHSD8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSD8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDSS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSSS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSS12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSS12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHSS8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSS8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDSH12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSH12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDSH8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashDSH8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSSH12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSH12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSSH8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashSSH8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHSH12XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSH12DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHSH8XpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } else { dslashHSH8DaggerXpayKernel <<>> ((float4 *)res.spinor, oddBit, a); } } } } } void cloverDslashXpayHCuda(ParitySpinor res, FullGauge gauge, FullClover cloverInv, ParitySpinor spinor, int oddBit, int daggerBit, ParitySpinor x, double a) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(cloverInv.odd); clover_prec = cloverInv.odd.precision; } else { bindCloverTex(cloverInv.even); clover_prec = cloverInv.even.precision; } int spinor_bytes = res.length*sizeof(float)/2; cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes); cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12); cudaBindTexture(0, accumTexHalf, x.spinor, spinor_bytes); cudaBindTexture(0, accumTexNorm, x.spinorNorm, spinor_bytes/12); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.precision == QUDA_DOUBLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHD12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHD12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDHD8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHD8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHD12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHD12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSHD8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHD8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHD12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHD12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHHD8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHD8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHS12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHS12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDHS8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHS8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHS12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHS12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSHS8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHS8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHS12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHS12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHHS8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHS8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } } else { if (gauge.precision == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashDHH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashDHH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashDHH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } #endif } else if (gauge.precision == QUDA_SINGLE_PRECISION) { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashSHH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashSHH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashSHH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } else { if (gauge.reconstruct == QUDA_RECONSTRUCT_12) { if (!daggerBit) { dslashHHH12XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHH12DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } else if (gauge.reconstruct == QUDA_RECONSTRUCT_8) { if (!daggerBit) { dslashHHH8XpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } else { dslashHHH8DaggerXpayKernel <<>> ((short4*)res.spinor, (float*)res.spinorNorm, oddBit, a); } } } } } // Apply the even-odd preconditioned clover-improved Dirac operator void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in, double kappa, ParitySpinor tmp, MatPCType matpc_type, int dagger) { double kappa2 = -kappa*kappa; if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC)) && (clover.even.clover == NULL)) { printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n"); } // FIXME: For asymmetric, a "dslashCxpay" kernel would improve performance. if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger); cloverCuda(out, gauge, clover, in, 0); dslashXpayCuda(out, gauge, tmp, 0, dagger, out, kappa2); // safe since out is not read after writing } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger); cloverCuda(out, gauge, clover, in, 1); dslashXpayCuda(out, gauge, tmp, 1, dagger, out, kappa2); } else if (!dagger) { // symmetric preconditioning if (matpc_type == QUDA_MATPC_EVEN_EVEN) { cloverDslashCuda(tmp, gauge, cloverInv, in, 1, dagger); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 0, dagger, in, kappa2); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger); cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2); } else { printf("QUDA error: invalid matpc_type\n"); exit(-1); } } else { // symmetric preconditioning, dagger if (matpc_type == QUDA_MATPC_EVEN_EVEN) { cloverCuda(out, gauge, cloverInv, in, 0); cloverDslashCuda(tmp, gauge, cloverInv, out, 1, dagger); dslashXpayCuda(out, gauge, tmp, 0, dagger, in, kappa2); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { cloverCuda(out, gauge, cloverInv, in, 1); cloverDslashCuda(tmp, gauge, cloverInv, out, 0, dagger); dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2); } else { printf("QUDA error: invalid matpc_type\n"); exit(-1); } } } void cloverMatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in, double kappa, ParitySpinor tmp, MatPCType matpc_type) { ParitySpinor aux = allocateParitySpinor(out.X, out.precision); // FIXME: eliminate aux cloverMatPCCuda(aux, gauge, clover, cloverInv, in, kappa, tmp, matpc_type, 0); cloverMatPCCuda(out, gauge, clover, cloverInv, aux, kappa, tmp, matpc_type, 1); freeParitySpinor(aux); } // Apply the full operator (FIXME: create kernel to eliminate tmp) void cloverMatCuda(FullSpinor out, FullGauge gauge, FullClover clover, FullSpinor in, double kappa, ParitySpinor tmp, int dagger) { cloverCuda(tmp, gauge, clover, in.odd, 1); dslashXpayCuda(out.odd, gauge, in.even, 1, dagger, tmp, -kappa); cloverCuda(tmp, gauge, clover, in.even, 0); dslashXpayCuda(out.even, gauge, in.odd, 0, dagger, tmp, -kappa); } // ---------------------------------------------------------------------- // Apply the clover term only void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover, ParitySpinor in, int parity) { if (!initDslash) initDslashCuda(gauge); checkSpinor(in, out); checkGaugeSpinor(in, gauge); checkCloverSpinor(in, clover); if (in.precision == QUDA_DOUBLE_PRECISION) { cloverDCuda(out, gauge, clover, in, parity); } else if (in.precision == QUDA_SINGLE_PRECISION) { cloverSCuda(out, gauge, clover, in, parity); } else if (in.precision == QUDA_HALF_PRECISION) { cloverHCuda(out, gauge, clover, in, parity); } dslash_quda_flops += 504*in.volume; } void cloverDCuda(ParitySpinor res, FullGauge gauge, FullClover clover, ParitySpinor spinor, int oddBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(clover.odd); clover_prec = clover.odd.precision; } else { bindCloverTex(clover.even); clover_prec = clover.even.precision; } int spinor_bytes = res.length*sizeof(double); cudaBindTexture(0, spinorTexDouble, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(double); #if (__CUDA_ARCH__ == 130) if (clover_prec == QUDA_DOUBLE_PRECISION) { cloverDDKernel <<>> ((double2 *)res.spinor, oddBit); } else if (clover_prec == QUDA_SINGLE_PRECISION) { cloverDSKernel <<>> ((double2 *)res.spinor, oddBit); } else { cloverDHKernel <<>> ((double2 *)res.spinor, oddBit); } #endif } void cloverSCuda(ParitySpinor res, FullGauge gauge, FullClover clover, ParitySpinor spinor, int oddBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(clover.odd); clover_prec = clover.odd.precision; } else { bindCloverTex(clover.even); clover_prec = clover.even.precision; } int spinor_bytes = res.length*sizeof(float); cudaBindTexture(0, spinorTexSingle, spinor.spinor, spinor_bytes); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) cloverSDKernel <<>> ((float4 *)res.spinor, oddBit); #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { cloverSSKernel <<>> ((float4 *)res.spinor, oddBit); } else { cloverSHKernel <<>> ((float4 *)res.spinor, oddBit); } } void cloverHCuda(ParitySpinor res, FullGauge gauge, FullClover clover, ParitySpinor spinor, int oddBit) { dim3 gridDim(res.volume/gauge.blockDim, 1, 1); dim3 blockDim(gauge.blockDim, 1, 1); Precision clover_prec; bindGaugeTex(gauge, oddBit); if (oddBit) { bindCloverTex(clover.odd); clover_prec = clover.odd.precision; } else { bindCloverTex(clover.even); clover_prec = clover.even.precision; } int spinor_bytes = res.length*sizeof(float)/2; cudaBindTexture(0, spinorTexHalf, spinor.spinor, spinor_bytes); cudaBindTexture(0, spinorTexNorm, spinor.spinorNorm, spinor_bytes/12); int shared_bytes = blockDim.x*SHARED_FLOATS_PER_THREAD*sizeof(float); if (clover_prec == QUDA_DOUBLE_PRECISION) { #if (__CUDA_ARCH__ == 130) cloverHDKernel <<>> ((short4 *)res.spinor, (float *)res.spinorNorm, oddBit); #endif } else if (clover_prec == QUDA_SINGLE_PRECISION) { cloverHSKernel <<>> ((short4 *)res.spinor, (float *)res.spinorNorm, oddBit); } else { cloverHHKernel <<>> ((short4 *)res.spinor, (float *)res.spinorNorm, oddBit); } }