Advanced Computing Platform for Theoretical Physics

commit大文件会使得服务器变得不稳定,请大家尽量只commit代码,不要commit大的文件。

Commit 0bf35c40 authored by mikeaclark's avatar mikeaclark
Browse files

Fast runtime dimensions - finally\!

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@438 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 24b38683
......@@ -329,7 +329,7 @@ o32_re = o32_im = 0;
// 0 i 1 0
// i 0 0 1
int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;
int sp_idx = ((x1==X1m1) ? X-X1m1 : X+1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -410,7 +410,7 @@ o32_re = o32_im = 0;
// 0 -i 1 0
// -i 0 0 1
int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;
int sp_idx = ((x1==0) ? X+X1m1 : X-1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -491,7 +491,7 @@ o32_re = o32_im = 0;
// 0 1 1 0
// -1 0 0 1
int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;
int sp_idx = ((x2==X2m1) ? X-X2X1mX1 : X+X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -572,7 +572,7 @@ o32_re = o32_im = 0;
// 0 -1 1 0
// 1 0 0 1
int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;
int sp_idx = ((x2==0) ? X+X2X1mX1 : X-X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -653,7 +653,7 @@ o32_re = o32_im = 0;
// i 0 1 0
// 0 -i 0 1
int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;
int sp_idx = ((x3==X3m1) ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -734,7 +734,7 @@ o32_re = o32_im = 0;
// -i 0 1 0
// 0 i 0 1
int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;
int sp_idx = ((x3==0) ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -815,10 +815,10 @@ o32_re = o32_im = 0;
// 0 0 2 0
// 0 0 0 2
int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;
int sp_idx = ((x4==X4m1) ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
int ga_idx = sid;
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
......@@ -929,10 +929,10 @@ o32_re = o32_im = 0;
// 0 0 0 0
// 0 0 0 0
int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;
int sp_idx = ((x4==0) ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
int ga_idx = sp_idx;
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
......
......@@ -302,14 +302,14 @@ def gen(dir):
str.append("// "+l+"\n")
str.append("\n")
if dir == 0: str.append("int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;\n")
if dir == 1: str.append("int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;\n")
if dir == 2: str.append("int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;\n")
if dir == 3: str.append("int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;\n")
if dir == 4: str.append("int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;\n")
if dir == 5: str.append("int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;\n")
if dir == 6: str.append("int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;\n")
if dir == 7: str.append("int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;\n")
if dir == 0: str.append("int sp_idx = ((x1==X1m1) ? X-X1m1 : X+1) >> 1;\n")
if dir == 1: str.append("int sp_idx = ((x1==0) ? X+X1m1 : X-1) >> 1;\n")
if dir == 2: str.append("int sp_idx = ((x2==X2m1) ? X-X2X1mX1 : X+X1) >> 1;\n")
if dir == 3: str.append("int sp_idx = ((x2==0) ? X+X2X1mX1 : X-X1) >> 1;\n")
if dir == 4: str.append("int sp_idx = ((x3==X3m1) ? X-X3X2X1mX2X1 : X+X2X1) >> 1;\n")
if dir == 5: str.append("int sp_idx = ((x3==0) ? X+X3X2X1mX2X1 : X-X2X1) >> 1;\n")
if dir == 6: str.append("int sp_idx = ((x4==X4m1) ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;\n")
if dir == 7: str.append("int sp_idx = ((x4==0) ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;\n")
ga_idx = "sid" if dir % 2 == 0 else "sp_idx"
str.append("int ga_idx = "+ga_idx+";\n\n")
......@@ -421,7 +421,7 @@ def gen(dir):
reconstruct.append("\n")
if dir >= 6:
str.append("if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) ")
str.append("if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) ")
str.append(block(''.join(load_spinor) + ''.join(project) + ''.join(ident) + ''.join(reconstruct)))
str.append("else ")
str.append(block(''.join(load_gauge) + ''.join(load_spinor) + ''.join(reconstruct_gauge) +
......
......@@ -329,7 +329,7 @@ o32_re = o32_im = 0;
// 0 -i 1 0
// -i 0 0 1
int sp_idx = ((x1==X1-1) ? X-X1+1 : X+1) >> 1;
int sp_idx = ((x1==X1m1) ? X-X1m1 : X+1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -410,7 +410,7 @@ o32_re = o32_im = 0;
// 0 i 1 0
// i 0 0 1
int sp_idx = ((x1==0) ? X+X1-1 : X-1) >> 1;
int sp_idx = ((x1==0) ? X+X1m1 : X-1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -491,7 +491,7 @@ o32_re = o32_im = 0;
// 0 -1 1 0
// 1 0 0 1
int sp_idx = ((x2==X2-1) ? X-X2X1+X1 : X+X1) >> 1;
int sp_idx = ((x2==X2m1) ? X-X2X1mX1 : X+X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -572,7 +572,7 @@ o32_re = o32_im = 0;
// 0 1 1 0
// -1 0 0 1
int sp_idx = ((x2==0) ? X+X2X1-X1 : X-X1) >> 1;
int sp_idx = ((x2==0) ? X+X2X1mX1 : X-X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -653,7 +653,7 @@ o32_re = o32_im = 0;
// -i 0 1 0
// 0 i 0 1
int sp_idx = ((x3==X3-1) ? X-X3X2X1+X2X1 : X+X2X1) >> 1;
int sp_idx = ((x3==X3m1) ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
int ga_idx = sid;
// read gauge matrix from device memory
......@@ -734,7 +734,7 @@ o32_re = o32_im = 0;
// i 0 1 0
// 0 -i 0 1
int sp_idx = ((x3==0) ? X+X3X2X1-X2X1 : X-X2X1) >> 1;
int sp_idx = ((x3==0) ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
int ga_idx = sp_idx;
// read gauge matrix from device memory
......@@ -815,10 +815,10 @@ o32_re = o32_im = 0;
// 0 0 0 0
// 0 0 0 0
int sp_idx = ((x4==X4-1) ? X-X4X3X2X1+X3X2X1 : X+X3X2X1) >> 1;
int sp_idx = ((x4==X4m1) ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
int ga_idx = sid;
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
......@@ -929,10 +929,10 @@ o32_re = o32_im = 0;
// 0 0 2 0
// 0 0 0 2
int sp_idx = ((x4==0) ? X+X4X3X2X1-X3X2X1 : X-X3X2X1) >> 1;
int sp_idx = ((x4==0) ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
int ga_idx = sp_idx;
if (gauge_fixed && ga_idx < X4X3X2X1h-X3X2X1h) {
if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
......
......@@ -56,28 +56,28 @@ texture<float, 1, cudaReadModeElementType> cloverTexNorm;
QudaGaugeParam *gauge_param;
QudaInvertParam *invert_param;
__constant__ int X1h;
__constant__ int X1;
__constant__ int X2;
__constant__ int X3;
__constant__ int X4;
__constant__ int X1h;
__constant__ int X2X1h;
__constant__ int X3X2X1h;
__constant__ int X4X3X2X1h;
__constant__ int X2X1;
__constant__ int X3X2X1;
__constant__ int X4X3X2X1;
__constant__ int X1m1;
__constant__ int X2m1;
__constant__ int X3m1;
__constant__ int X4m1;
__constant__ float X1h_inv;
__constant__ float X2X1h_inv;
__constant__ float X3X2X1h_inv;
__constant__ int X2X1mX1;
__constant__ int X3X2X1mX2X1;
__constant__ int X4X3X2X1mX3X2X1;
__constant__ int X4X3X2X1hmX3X2X1h;
__constant__ float X1_inv;
__constant__ float X1h_inv;
__constant__ float X2_inv;
__constant__ float X3_inv;
__constant__ float X2X1_inv;
__constant__ float X3X2X1_inv;
__constant__ int X2X1;
__constant__ int X3X2X1;
__constant__ int Vh;
......@@ -92,6 +92,8 @@ __constant__ float pi_f;
__constant__ double anisotropy;
__constant__ double t_boundary;
int initDslash = 0;
#include <dslash_def.h>
void initDslashCuda(FullGauge gauge) {
......@@ -121,44 +123,41 @@ void initDslashCuda(FullGauge gauge) {
int X3X2X1 = X3*X2*X1;
cudaMemcpyToSymbol("X3X2X1", &X3X2X1, sizeof(int));
int X4X3X2X1 = X4*X3*X2*X1;
cudaMemcpyToSymbol("X4X3X2X1", &X4X3X2X1, sizeof(int));
int X1h = X1/2;
cudaMemcpyToSymbol("X1h", &X1h, sizeof(int));
int X2X1h = X2*X1h;
cudaMemcpyToSymbol("X2X1h", &X2X1h, sizeof(int));
int X3X2X1h = X3*X2*X1h;
cudaMemcpyToSymbol("X3X2X1h", &X3X2X1h, sizeof(int));
int X4X3X2X1h = X4*X3*X2*X1h;
cudaMemcpyToSymbol("X4X3X2X1h", &X4X3X2X1h, sizeof(int));
float X1h_inv = 1.0 / X1h;
cudaMemcpyToSymbol("X1h_inv", &X1h_inv, sizeof(float));
float X2X1h_inv = 1.0 / X2X1h;
cudaMemcpyToSymbol("X2X1h_inv", &X2X1h_inv, sizeof(float));
float X3X2X1h_inv = 1.0 / X3X2X1h;
cudaMemcpyToSymbol("X3X2X1h_inv", &X3X2X1h_inv, sizeof(float));
float X1_inv = 1.0 / X1;
cudaMemcpyToSymbol("X1_inv", &X1_inv, sizeof(float));
float X2_inv = 1.0 / X2;
cudaMemcpyToSymbol("X2_inv", &X2_inv, sizeof(float));
float X3_inv = 1.0 / X3;
cudaMemcpyToSymbol("X3_inv", &X3_inv, sizeof(float));
float X2X1_inv = 1.0 / X2X1;
cudaMemcpyToSymbol("X2X1_inv", &X2X1_inv, sizeof(float));
int X1m1 = X1 - 1;
cudaMemcpyToSymbol("X1m1", &X1m1, sizeof(int));
int X2m1 = X2 - 1;
cudaMemcpyToSymbol("X2m1", &X2m1, sizeof(int));
float X3X2X1_inv = 1.0 / X3X2X1;
cudaMemcpyToSymbol("X3X2X1_inv", &X3X2X1_inv, sizeof(float));
int X3m1 = X3 - 1;
cudaMemcpyToSymbol("X3m1", &X3m1, sizeof(int));
int X4m1 = X4 - 1;
cudaMemcpyToSymbol("X4m1", &X4m1, sizeof(int));
int X2X1mX1 = X2X1 - X1;
cudaMemcpyToSymbol("X2X1mX1", &X2X1mX1, sizeof(int));
int X3X2X1mX2X1 = X3X2X1 - X2X1;
cudaMemcpyToSymbol("X3X2X1mX2X1", &X3X2X1mX2X1, sizeof(int));
int X4X3X2X1mX3X2X1 = (X4-1)*X3X2X1;
cudaMemcpyToSymbol("X4X3X2X1mX3X2X1", &X4X3X2X1mX3X2X1, sizeof(int));
int X4X3X2X1hmX3X2X1h = (X4-1)*X3*X2*X1h;
cudaMemcpyToSymbol("X4X3X2X1hmX3X2X1h", &X4X3X2X1hmX3X2X1h, sizeof(int));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
......@@ -176,6 +175,8 @@ void initDslashCuda(FullGauge gauge) {
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
initDslash = 1;
}
void bindGaugeTex(FullGauge gauge, int oddBit) {
......@@ -238,7 +239,7 @@ void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
}
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger) {
initDslashCuda(gauge);
if (!initDslash) initDslashCuda(gauge);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......@@ -434,7 +435,7 @@ void dslashHCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger,
ParitySpinor x, double a) {
initDslashCuda(gauge);
if (!initDslash) initDslashCuda(gauge);
checkSpinor(in, out);
checkGaugeSpinor(in, gauge);
......
......@@ -25,15 +25,15 @@ void *spinorGPU, *spinorGPUEven, *spinorGPUOdd;
double kappa = 1.0;
int ODD_BIT = 1;
int DAGGER_BIT = 1;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
void init() {
gaugeParam.X[0] = 24;
gaugeParam.X[1] = 24;
gaugeParam.X[2] = 24;
gaugeParam.X[3] = 32;
gaugeParam.X[0] = 36;
gaugeParam.X[1] = 36;
gaugeParam.X[2] = 36;
gaugeParam.X[3] = 24;
setDims(gaugeParam.X);
gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION;
......
// this will evaluate wrong for (1/L^3) < 3e-7
//#define FAST_INT_DIVIDE(a, b) ((int)(__fdividef((float)a, (float)b)+0.0000003f))
#define FAST_INT_DIVIDE(a, b) ( (int)(((float)a * b##_inv)+0.0000003f) )
//#define FAST_INT_DIVIDE(a, b) ( (int)( __fdiv_rn((float)a, (float)b) +3e-7) )
//#define FAST_INT_DIVIDE(a, b) ( a/b )
#define FAST_INT_MOD(a, b) (a - FAST_INT_DIVIDE(a,b)*b)
//#define FAST_INT_MOD(a, b) (a % b)
// Performs complex addition
#define COMPLEX_ADD_TO(a, b) \
a##_re += b##_re, \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment