
Commit df296187 authored by mikeaclark

Some reordering changes to squeeze out more flops

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@309 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 2ae57e73
......@@ -5,7 +5,9 @@ CUDA_INSTALL_PATH = /usr/local/cuda
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include
LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart
DFLAGS = -DQudaSumFloat=double -DQudaSumComplex=cuDoubleComplex -DUNIX #-D__DEVICE_EMULATION__
DFLAGS = -DQudaSumFloat=double #-DREDUCE_KNUTH_SUMMATION
#-D__DEVICE_EMULATION__
CC = gcc
CFLAGS = -Wall -O3 -std=c99 $(INCLUDES) ${DFLAGS}
CXX = g++
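The commented-out -DREDUCE_KNUTH_SUMMATION flag above hints at an alternative to the double-precision accumulation used elsewhere in this commit: compensated (Kahan/Knuth) summation. A minimal sketch of that technique, as something the flag would plausibly enable (this is an illustration, not the project's implementation; it compiles under the -std=c99 used above):

static float kahan_sum(const float *x, int n) {
    float sum = 0.0f, c = 0.0f;   /* c carries the lost low-order bits */
    for (int i = 0; i < n; i++) {
        float y = x[i] - c;
        float t = sum + y;
        c = (t - sum) - y;        /* rounding error recovered here */
        sum = t;
    }
    return sum;
}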
......
......@@ -4,6 +4,14 @@
#ifndef _QUDA_BLAS_H
#define _QUDA_BLAS_H
#if QudaSumFloat==double
#define QudaSumComplex cuDoubleComplex
#define QudaSumFloat3 double3
#else
#define QudaSumComplex cuComplex
#define QudaSumFloat3 float3
#endif
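One caveat with the #if test above: in a preprocessor conditional, any identifier left after macro expansion evaluates to 0, so with -DQudaSumFloat=float the comparison becomes 0==0 and the double branch is still taken. A sketch of a more robust variant, keyed off a dedicated flag (QUDA_SUM_DOUBLE is hypothetical, not in this commit):

#ifdef QUDA_SUM_DOUBLE            /* hypothetical flag */
#define QudaSumComplex cuDoubleComplex
#define QudaSumFloat3 double3
#else
#define QudaSumComplex cuComplex
#define QudaSumFloat3 float3
#endif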
#ifdef __cplusplus
extern "C" {
#endif
......
// *** CUDA DSLASH ***
#define SHARED_FLOATS_PER_THREAD 19
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_BYTES (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
#define i00_re I0.x
......@@ -69,25 +69,6 @@
#define A_re G4.z
#define A_im G4.w
/*#define o00_re s[0]
#define o00_im s[1]
#define o01_re s[2]
#define o01_im s[3]
#define o02_re s[4]
#define o02_im s[5]
#define o10_re s[6]
#define o10_im s[7]
#define o11_re s[8]
#define o11_im s[9]
#define o12_re s[10]
#define o12_im s[11]
#define o20_re s[12]
#define o20_im s[13]
#define o21_re s[14]
#define o21_im s[15]
#define o22_re s[16]
#define o22_im s[17]
#define o30_re s[18]*/
volatile float o00_re;
volatile float o00_im;
volatile float o01_re;
......@@ -114,6 +95,7 @@ volatile float o32_re;
volatile float o32_im;
#include "read_gauge.h"
#include "io_spinor.h"
......@@ -125,9 +107,6 @@ int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
//extern __shared__ float s_data[];
//volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;
o00_re = o00_im = 0;
o01_re = o01_im = 0;
o02_re = o02_im = 0;
......@@ -157,6 +136,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(0);
// project spinor into half spinors
float a0_re = +i00_re+i30_im;
float a0_im = +i00_im-i30_re;
......@@ -235,6 +217,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(1);
// project spinor into half spinors
float a0_re = +i00_re-i30_im;
float a0_im = +i00_im+i30_re;
......@@ -313,6 +298,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(2);
// project spinor into half spinors
float a0_re = +i00_re-i30_re;
float a0_im = +i00_im-i30_im;
......@@ -391,6 +379,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(3);
// project spinor into half spinors
float a0_re = +i00_re+i30_re;
float a0_im = +i00_im+i30_im;
......@@ -469,6 +460,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(4);
// project spinor into half spinors
float a0_re = +i00_re+i20_im;
float a0_im = +i00_im-i20_re;
......@@ -547,6 +541,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(5);
// project spinor into half spinors
float a0_re = +i00_re-i20_im;
float a0_im = +i00_im+i20_re;
......@@ -669,6 +666,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(6);
// project spinor into half spinors
float a0_re = +2*i20_re;
float a0_im = +2*i20_im;
......@@ -780,6 +780,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(7);
// project spinor into half spinors
float a0_re = +2*i00_re;
float a0_im = +2*i00_im;
......@@ -833,12 +836,7 @@ o32_re = o32_im = 0;
#ifdef DSLASH_XPAY
float4 accum0 = tex1Dfetch(accumTex, sid + 0*Nh);
float4 accum1 = tex1Dfetch(accumTex, sid + 1*Nh);
float4 accum2 = tex1Dfetch(accumTex, sid + 2*Nh);
float4 accum3 = tex1Dfetch(accumTex, sid + 3*Nh);
float4 accum4 = tex1Dfetch(accumTex, sid + 4*Nh);
float4 accum5 = tex1Dfetch(accumTex, sid + 5*Nh);
READ_ACCUM(ACCUMTEX)
o00_re = a*o00_re + accum0.x;
o00_im = a*o00_im + accum0.y;
o01_re = a*o01_re + accum0.z;
......@@ -869,22 +867,4 @@ o32_re = o32_im = 0;
// write spinor field back to device memory
WRITE_SPINOR();
#undef o00_re
#undef o00_im
#undef o01_re
#undef o01_im
#undef o02_re
#undef o02_im
#undef o10_re
#undef o10_im
#undef o11_re
#undef o11_im
#undef o12_re
#undef o12_im
#undef o20_re
#undef o20_im
#undef o21_re
#undef o21_im
#undef o22_re
#undef o22_im
#undef o30_re
......@@ -94,7 +94,7 @@ projectors = [
### code generation ########################################################################
### parameters
sharedFloats = 19
sharedFloats = 0
dagger = False
def block(code):
......@@ -182,8 +182,9 @@ int x1 = X % L1;
""")
str.append("extern __shared__ float s_data[];\n")
str.append("volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n\n")
if sharedFloats > 0:
str.append("extern __shared__ float s_data[];\n")
str.append("volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n\n")
for s in range(0,4):
for c in range(0,3):
......@@ -200,12 +201,7 @@ def epilog():
str.append(
"""
#ifdef DSLASH_XPAY
float4 accum0 = tex1Dfetch(accumTex, sid + 0*Nh);
float4 accum1 = tex1Dfetch(accumTex, sid + 1*Nh);
float4 accum2 = tex1Dfetch(accumTex, sid + 2*Nh);
float4 accum3 = tex1Dfetch(accumTex, sid + 3*Nh);
float4 accum4 = tex1Dfetch(accumTex, sid + 4*Nh);
float4 accum5 = tex1Dfetch(accumTex, sid + 5*Nh);
READ_ACCUM(ACCUMTEX)
""")
for s in range(0,4):
......@@ -275,16 +271,20 @@ def gen(dir):
load_spinor = []
load_spinor.append("// read spinor from device memory\n")
if row_cnt[0] == 0:
load_spinor.append("READ_SPINOR_DOWN(spinorTex);\n\n")
load_spinor.append("READ_SPINOR_DOWN(SPINORTEX);\n\n")
elif row_cnt[2] == 0:
load_spinor.append("READ_SPINOR_UP(spinorTex);\n\n")
load_spinor.append("READ_SPINOR_UP(SPINORTEX);\n\n")
else:
load_spinor.append("READ_SPINOR(spinorTex);\n\n")
load_spinor.append("READ_SPINOR(SPINORTEX);\n\n")
load_gauge = []
load_gauge.append("// read gauge matrix from device memory\n")
load_gauge.append("READ_GAUGE_MATRIX(GAUGE"+`dir%2`+"TEX, "+`dir`+");\n\n")
reconstruct_gauge = []
reconstruct_gauge.append("// reconstruct gauge matrix\n")
reconstruct_gauge.append("RECONSTRUCT_GAUGE_MATRIX("+`dir`+");\n\n")
project = []
project.append("// project spinor into half spinors\n")
for h in range(0, 2):
......@@ -367,9 +367,11 @@ def gen(dir):
str.append("if (gauge_fixed && ga_idx < (L4-1)*L1h*L2*L3) ")
str.append(block(''.join(load_spinor) + ''.join(project) + ''.join(ident) + ''.join(reconstruct)))
str.append("else ")
str.append(block(''.join(load_gauge) + ''.join(load_spinor) + ''.join(project) + ''.join(mult) + ''.join(reconstruct)))
str.append(block(''.join(load_gauge) + ''.join(load_spinor) + ''.join(reconstruct_gauge) +
''.join(project) + ''.join(mult) + ''.join(reconstruct)))
else:
str.append(''.join(load_gauge) + ''.join(load_spinor) + ''.join(project) + ''.join(mult) + ''.join(reconstruct))
str.append(''.join(load_gauge) + ''.join(load_spinor) + ''.join(reconstruct_gauge) +
''.join(project) + ''.join(mult) + ''.join(reconstruct))
return block(''.join(str))+"\n"
# end def gen
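The generator change above is the "reordering" of the commit message: RECONSTRUCT_GAUGE_MATRIX is split out of the gauge load and emitted only after the spinor load, so the reconstruction arithmetic can overlap the latency of the outstanding spinor fetch. A self-contained sketch of the scheduling idea (names and layout are illustrative, not the generated kernel):

__global__ void overlap_demo(const float4 *gauge, const float4 *spinor,
                             float4 *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float4 g = gauge[i];     /* issue the gauge load first            */
    float4 s = spinor[i];    /* issue the spinor load before any math */
    /* stand-in for reconstruction: ALU work on g while s is in flight */
    float w = g.x*g.x + g.y*g.y + g.z*g.z + g.w*g.w;
    out[i] = make_float4(w*s.x, w*s.y, w*s.z, w*s.w);
}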
......@@ -380,6 +382,7 @@ def generate():
return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + epilog()
dagger = False
#dagger = True
sharedFloats = 0
dagger = True
print generate()
// *** CUDA DSLASH DAGGER ***
#define SHARED_FLOATS_PER_THREAD 19
#define SHARED_FLOATS_PER_THREAD 0
#define SHARED_BYTES (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))
#define i00_re I0.x
......@@ -69,25 +69,25 @@
#define A_re G4.z
#define A_im G4.w
#define o00_re s[0]
#define o00_im s[1]
#define o01_re s[2]
#define o01_im s[3]
#define o02_re s[4]
#define o02_im s[5]
#define o10_re s[6]
#define o10_im s[7]
#define o11_re s[8]
#define o11_im s[9]
#define o12_re s[10]
#define o12_im s[11]
#define o20_re s[12]
#define o20_im s[13]
#define o21_re s[14]
#define o21_im s[15]
#define o22_re s[16]
#define o22_im s[17]
#define o30_re s[18]
volatile float o00_re;
volatile float o00_im;
volatile float o01_re;
volatile float o01_im;
volatile float o02_re;
volatile float o02_im;
volatile float o10_re;
volatile float o10_im;
volatile float o11_re;
volatile float o11_im;
volatile float o12_re;
volatile float o12_im;
volatile float o20_re;
volatile float o20_im;
volatile float o21_re;
volatile float o21_im;
volatile float o22_re;
volatile float o22_im;
volatile float o30_re;
volatile float o30_im;
volatile float o31_re;
volatile float o31_im;
......@@ -107,9 +107,6 @@ int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
extern __shared__ float s_data[];
volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;
o00_re = o00_im = 0;
o01_re = o01_im = 0;
o02_re = o02_im = 0;
......@@ -139,6 +136,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(0);
// project spinor into half spinors
float a0_re = +i00_re-i30_im;
float a0_im = +i00_im+i30_re;
......@@ -217,6 +217,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(1);
// project spinor into half spinors
float a0_re = +i00_re+i30_im;
float a0_im = +i00_im-i30_re;
......@@ -295,6 +298,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(2);
// project spinor into half spinors
float a0_re = +i00_re+i30_re;
float a0_im = +i00_im+i30_im;
......@@ -373,6 +379,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(3);
// project spinor into half spinors
float a0_re = +i00_re-i30_re;
float a0_im = +i00_im-i30_im;
......@@ -451,6 +460,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(4);
// project spinor into half spinors
float a0_re = +i00_re-i20_im;
float a0_im = +i00_im+i20_re;
......@@ -529,6 +541,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(5);
// project spinor into half spinors
float a0_re = +i00_re+i20_im;
float a0_im = +i00_im-i20_re;
......@@ -651,6 +666,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR_UP(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(6);
// project spinor into half spinors
float a0_re = +2*i00_re;
float a0_im = +2*i00_im;
......@@ -762,6 +780,9 @@ o32_re = o32_im = 0;
// read spinor from device memory
READ_SPINOR_DOWN(SPINORTEX);
// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(7);
// project spinor into half spinors
float a0_re = +2*i20_re;
float a0_im = +2*i20_im;
......@@ -815,12 +836,7 @@ o32_re = o32_im = 0;
#ifdef DSLASH_XPAY
float4 accum0 = tex1Dfetch(accumTex, sid + 0*Nh);
float4 accum1 = tex1Dfetch(accumTex, sid + 1*Nh);
float4 accum2 = tex1Dfetch(accumTex, sid + 2*Nh);
float4 accum3 = tex1Dfetch(accumTex, sid + 3*Nh);
float4 accum4 = tex1Dfetch(accumTex, sid + 4*Nh);
float4 accum5 = tex1Dfetch(accumTex, sid + 5*Nh);
READ_ACCUM(ACCUMTEX)
o00_re = a*o00_re + accum0.x;
o00_im = a*o00_im + accum0.y;
o01_re = a*o01_re + accum0.z;
......@@ -851,22 +867,4 @@ o32_re = o32_im = 0;
// write spinor field back to device memory
WRITE_SPINOR();
#undef o00_re
#undef o00_im
#undef o01_re
#undef o01_im
#undef o02_re
#undef o02_im
#undef o10_re
#undef o10_im
#undef o11_re
#undef o11_im
#undef o12_re
#undef o12_im
#undef o20_re
#undef o20_im
#undef o21_re
#undef o21_im
#undef o22_re
#undef o22_im
#undef o30_re
[Two collapsed file diffs are not shown.]
......@@ -24,22 +24,35 @@ extern "C" {
#endif
extern FullGauge cudaGauge;
extern FullGauge cudaHGauge;
extern QudaGaugeParam *gauge_param;
extern QudaInvertParam *invert_param;
extern short4 *spinorHalf;
extern float *spinorNorm;
extern ParityHSpinor hSpinor1;
extern ParityHSpinor hSpinor2;
// ---------- dslash_quda.cu ----------
int dslashCudaSharedBytes();
void setCudaGaugeParam();
void dslashCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int dslashCudaSharedBytes();
// Single precision routines
void dslashSCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpaySCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
// Half precision dslash routines
void dslashHCuda(ParityHSpinor res, FullGauge gauge, ParityHSpinor spinor,
int oddBit, int daggerBit);
void dslashXpayHCuda(ParitySpinor res, FullGauge gauge, ParityHSpinor spinor,
int oddBit, int daggerBit, ParityHSpinor x, float a);
// wrapper to above
void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity, int dagger);
// Full Wilson matrix
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
......@@ -70,8 +83,9 @@ extern "C" {
ParitySpinor tmp, QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *param, DagType dag_type);
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *param, DagType dag_type);
// ---------- cg_reference.cpp ----------
void cgReference(float *out, float **gauge, float *in, float kappa, float tol);
......
......@@ -5,10 +5,12 @@
#include <util_quda.h>
#include <field_quda.h>
#define FULL_WILSON 0
#define FULL_WILSON 1
QudaGaugeParam param;
QudaGaugeParam gaugeParam;
QudaInvertParam inv_param;
FullGauge gauge;
FullSpinor cudaSpinor;
ParitySpinor tmp;
......@@ -19,6 +21,7 @@ float *spinorEven, *spinorOdd;
float kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
int TRANSFER = 0; // include transfer time in the benchmark?
void printSpinorHalfField(float *spinor) {
printSpinor(&spinor[0*spinorSiteSize]);
......@@ -29,25 +32,23 @@ void printSpinorHalfField(float *spinor) {
void init() {
cudaGauge.even = 0;
cudaGauge.odd = 0;
param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_SINGLE_PRECISION;
param.X = L1;
param.Y = L2;
param.Z = L3;
param.T = L4;
param.anisotropy = 2.3;
param.reconstruct = QUDA_RECONSTRUCT_8;
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
param.t_boundary = QUDA_ANTI_PERIODIC_T;
param.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &param;
gaugeParam.cpu_prec = QUDA_SINGLE_PRECISION;
gaugeParam.cuda_prec = QUDA_HALF_PRECISION;
gaugeParam.X = L1;
gaugeParam.Y = L2;
gaugeParam.Z = L3;
gaugeParam.T = L4;
gaugeParam.anisotropy = 2.3;
gaugeParam.reconstruct = QUDA_RECONSTRUCT_8;
gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T;
gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &gaugeParam;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
inv_param.kappa = kappa;
invert_param = &inv_param;
// construct input fields
......@@ -64,34 +65,24 @@ void init() {
constructGaugeField(hostGauge);
constructSpinorField(spinor);
/*{
float *spinor2 = (float*)malloc(N*spinorSiteSize*sizeof(float));
short *spinorHalf = (short*)malloc(Nh*spinorSiteSize*sizeof(short));
float *spinorNorm = (float*)malloc(Nh*sizeof(float));
spinorHalfPack(spinorNorm, spinorHalf, spinor);
spinorHalfUnpack(spinor2, spinorNorm, spinorHalf);
free(spinor2);
free(spinorNorm);
free(spinorHalf);
for (int i=0; i<24*Nh; i++) {
printf("%e %e %e\n", spinor[i], spinor2[i], fabs(spinor[i]-spinor2[i]));
}
//exit(0);
}*/
printf("done.\n"); fflush(stdout);
int dev = 1;
cudaSetDevice(dev);
int dev = 0;
initQuda(dev);
loadGaugeQuda((void*)hostGauge, &gaugeParam);
if (gaugeParam.cuda_prec == QUDA_SINGLE_PRECISION) gauge = cudaGauge;
else gauge = cudaHGauge;
printf("Sending fields to GPU..."); fflush(stdout);
loadGaugeField(hostGauge);
cudaSpinor = allocateSpinorField();
tmp = allocateParitySpinor();
loadSpinorField(cudaSpinor, (void*)spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
if (!TRANSFER) {
cudaSpinor = allocateSpinorField();
tmp = allocateParitySpinor();
loadSpinorField(cudaSpinor, (void*)spinor, inv_param.cpu_prec,
inv_param.cuda_prec, inv_param.dirac_order);
}
}
void end() {
......@@ -99,9 +90,11 @@ void end() {
for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
free(spinor);
free(spinorRef);
freeSpinorField(cudaSpinor);
freeSpinorBuffer();
freeParitySpinor(tmp);
if (!TRANSFER) {
freeSpinorField(cudaSpinor);
freeParitySpinor(tmp);
}
endQuda();
}
void dslashRef() {
......@@ -126,9 +119,11 @@ double dslashCUDA() {
stopwatchStart();
for (int i = 0; i < LOOPS; i++) {
if (FULL_WILSON)
MatPCCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
if (TRANSFER) MatPCQuda(spinorOdd, spinorEven, &inv_param);
else MatPCCuda(cudaSpinor.odd, gauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
else
dslashCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
if (TRANSFER) dslashQuda(spinorOdd, spinorEven, &inv_param, ODD_BIT, DAGGER_BIT);
else dslashCuda(cudaSpinor.odd, gauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
}
// check for errors
......@@ -164,11 +159,11 @@ void strongCheck() {
for (int f=0; f<fail_check; f++)
if (diff > pow(10,-(f+1))) fail[f]++;
//if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
if (diff > 1e-6) iter[j]++;
if (diff > 1e-3) iter[j]++;
}
}
//for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
for (int f=0; f<fail_check; f++) {
printf("%e Failures: %d / %d = %e\n", pow(10,-(f+1)), fail[f], Nh*24, fail[f] / (float)(Nh*24));
......@@ -184,7 +179,7 @@ void dslashTest() {
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(float) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);
printf("Gauge mem: %.3f GiB\n", gaugeParam.gaugeGiB);
printf("Shared mem: %.3f KB\n", sharedKB);
int attempts = 10000;
......@@ -192,13 +187,14 @@ void dslashTest() {
for (int i=0; i<attempts; i++) {
double secs = dslashCUDA();
retrieveParitySpinor(spinorOdd, cudaSpinor.odd, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
if (!TRANSFER) retrieveParitySpinor(spinorOdd, cudaSpinor.odd, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
// print timing information
printf("%fms per loop\n", 1000*secs);
int flops = FULL_WILSON ? 1320*2 + 48 : 1320;
int floats = FULL_WILSON ? 2*(7*24+8*param.packed_size+24)+24 : 7*24+8*param.packed_size+24;
int floats = FULL_WILSON ? 2*(7*24+8*gaugeParam.packed_size+24)+24 : 7*24+8*gaugeParam.packed_size+24;
printf("GFLOPS = %f\n", 1.0e-9*flops*Nh/secs);
printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
......
#include <stdlib.h>
#include <stdio.h>
......@@ -5,34 +6,34 @@
#include <field_quda.h>
#include <xmmintrin.h>
#define is_aligned(p,a) ((((size_t)(p))&((size_t)(a-1)))==0)
// GPU gauge field
FullGauge cudaGauge;
FullGauge cudaHGauge;
// Pinned memory for cpu-gpu memory copying
float4 *packedSpinor = 0;
// Half precision spinor field
short4 *spinorHalf = 0;
float *spinorNorm = 0;
// Half precision spinor field temporaries
ParityHSpinor hSpinor1, hSpinor2;
void allocateGaugeField(int packed_gauge_bytes) {
if (!cudaGauge.even) {
if (cudaMalloc((void **)&cudaGauge.even, packed_gauge_bytes) == cudaErrorMemoryAllocation) {
void allocateGaugeField(FullGauge *gauge, int packed_gauge_bytes) {
if (!gauge->even) {
if (cudaMalloc((void **)&gauge->even, packed_gauge_bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even gauge field\n");
exit(0);
}
}
if (!cudaGauge.odd) {
if (cudaMalloc((void **)&cudaGauge.odd, packed_gauge_bytes) == cudaErrorMemoryAllocation) {
if (!gauge->odd) {
if (cudaMalloc((void **)&gauge->odd, packed_gauge_bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even odd gauge field\n");
exit(0);
}
}
}
ParitySpinor allocateParitySpinor() {
ParitySpinor ret;
......@@ -60,6 +61,11 @@ void freeGaugeField() {
if (cudaGauge.odd) cudaFree(cudaGauge.odd);
cudaGauge.even = 0;
cudaGauge.odd = 0;
if (cudaHGauge.even) cudaFree(cudaHGauge.even);
if (cudaHGauge.odd) cudaFree(cudaHGauge.odd);
cudaHGauge.even = 0;
cudaHGauge.odd = 0;
}
void freeSpinorField(FullSpinor spinor) {
......@@ -77,15 +83,21 @@ void freeSpinorBuffer() {
}
void allocateSpinorHalf() {
cudaMalloc((void**)&spinorHalf, SPINOR_BYTES/2);
cudaMalloc((void**)&spinorNorm, SPINOR_BYTES/24);
cudaMalloc((void**)&hSpinor1.spinorHalf, SPINOR_BYTES/2);
cudaMalloc((void**)&hSpinor1.spinorNorm, SPINOR_BYTES/24);
cudaMalloc((void**)&hSpinor2.spinorHalf, SPINOR_BYTES/2);
cudaMalloc((void**)&hSpinor2.spinorNorm, SPINOR_BYTES/24);
}
void freeSpinorHalf() {
cudaFree(spinorHalf);
cudaFree(spinorNorm);
spinorHalf = 0;
spinorNorm = 0;
cudaFree(hSpinor1.spinorHalf);
cudaFree(hSpinor1.spinorNorm);
cudaFree(hSpinor2.spinorHalf);
cudaFree(hSpinor2.spinorNorm);
hSpinor1.spinorHalf = 0;
hSpinor1.spinorNorm = 0;
hSpinor2.spinorHalf = 0;
hSpinor2.spinorNorm = 0;
}
......@@ -158,6 +170,11 @@ inline void packHalf8Double(short4 *res, double *g) {
inline void packHalf8Single(short4 *res, float *g) {
res[0].x = floatToShort(atan2(g[1], g[0])/ M_PI);
res[0].y = floatToShort(atan2(g[13], g[12])/ M_PI);
/*float t1 = atan2(g[1], g[0])/ M_PI;
float t2 = atan2(g[13], g[12])/ M_PI;
if (fabs(t1)>0.99999) printf("%e %d\n", t1, res[0].x);
if (fabs(t2)>0.99999) printf("%e %d\n", t2, res[0].y);*/
res[0].z = floatToShort(g[2]);
res[0].w = floatToShort(g[3]);
res[Nh].x = floatToShort(g[4]);
......@@ -437,61 +454,69 @@ void loadGaugeField(void *gauge) {
}
int packed_gauge_bytes = (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) ? 8 : 12;
if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) packed_gauge_bytes /= 2;
gauge_param->packed_size = packed_gauge_bytes;
packed_gauge_bytes *= 4*Nh*sizeof(float);
allocateGaugeField(packed_gauge_bytes);
allocateGaugeField(&cudaGauge, packed_gauge_bytes);
cudaGauge.reconstruct = gauge_param->reconstruct;
cudaGauge.precision = QUDA_SINGLE_PRECISION;
// 2 since even-odd
gauge_param->gaugeGiB = (float)2*packed_gauge_bytes/ (1 << 30);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
// Use pinned memory
float4 *packedEven, *packedOdd;
// Use pinned memory
float4 *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
cudaMallocHost((void**)&packedEven, packed_gauge_bytes);
cudaMallocHost((void**)&packedOdd, packed_gauge_bytes);
cudaMallocHost((void**)&packedEven, packed_gauge_bytes);
cudaMallocHost((void**)&packedOdd, packed_gauge_bytes);
#else
packedEven = (float4*)malloc(packed_gauge_bytes);
packedOdd = (float4*)malloc(packed_gauge_bytes);
packedEven = (float4*)malloc(packed_gauge_bytes);
packedOdd = (float4*)malloc(packed_gauge_bytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPDoubleGaugeField(packedEven, (double**)gauge, 0, gauge_param->reconstruct);
packQDPDoubleGaugeField(packedOdd, (double**)gauge, 1, gauge_param->reconstruct);
} else {
packQDPSingleGaugeField(packedEven, (float**)gauge, 0, gauge_param->reconstruct);
packQDPSingleGaugeField(packedOdd, (float**)gauge, 1, gauge_param->reconstruct);
}
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSDoubleGaugeField(packedEven, (double*)gauge, 0, gauge_param->reconstruct);
packCPSDoubleGaugeField(packedOdd, (double*)gauge, 1, gauge_param->reconstruct);
} else {
packCPSSingleGaugeField(packedEven, (float*)gauge, 0, gauge_param->reconstruct);
packCPSSingleGaugeField(packedOdd, (float*)gauge, 1, gauge_param->reconstruct);
}
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packQDPDoubleGaugeField(packedEven, (double**)gauge, 0, gauge_param->reconstruct);
packQDPDoubleGaugeField(packedOdd, (double**)gauge, 1, gauge_param->reconstruct);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
packQDPSingleGaugeField(packedEven, (float**)gauge, 0, gauge_param->reconstruct);
packQDPSingleGaugeField(packedOdd, (float**)gauge, 1, gauge_param->reconstruct);
}
cudaMemcpy(cudaGauge.even, packedEven, packed_gauge_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge.odd, packedOdd, packed_gauge_bytes, cudaMemcpyHostToDevice);
} else if (gauge_param->gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packCPSDoubleGaugeField(packedEven, (double*)gauge, 0, gauge_param->reconstruct);
packCPSDoubleGaugeField(packedOdd, (double*)gauge, 1, gauge_param->reconstruct);
} else {
packCPSSingleGaugeField(packedEven, (float*)gauge, 0, gauge_param->reconstruct);
packCPSSingleGaugeField(packedOdd, (float*)gauge, 1, gauge_param->reconstruct);
}
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_param->gauge_order);
exit(-1);
}
cudaMemcpy(cudaGauge.even, packedEven, packed_gauge_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge.odd, packedOdd, packed_gauge_bytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
cudaFreeHost(packedEven);
cudaFreeHost(packedOdd);
#else
free(packedEven);
free(packedOdd);
free(packedEven);
free(packedOdd);
#endif
} else {
if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) {
packed_gauge_bytes /= 2;
gauge_param->packed_size += packed_gauge_bytes/(4*Nh*sizeof(float));
allocateGaugeField(&cudaHGauge, packed_gauge_bytes);
cudaHGauge.reconstruct = gauge_param->reconstruct;
cudaHGauge.precision = QUDA_HALF_PRECISION;
}
if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) {
// Use pinned memory
short4 *packedEven, *packedOdd;
#ifndef __DEVICE_EMULATION__
......@@ -501,7 +526,7 @@ void loadGaugeField(void *gauge) {
packedEven = (short4*)malloc(packed_gauge_bytes);
packedOdd = (short4*)malloc(packed_gauge_bytes);
#endif
if (gauge_param->gauge_order == QUDA_QDP_GAUGE_ORDER) {
if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION) {
packHalfQDPDoubleGaugeField(packedEven, (double**)gauge, 0, gauge_param->reconstruct);
......@@ -523,8 +548,8 @@ void loadGaugeField(void *gauge) {
exit(-1);
}
cudaMemcpy(cudaGauge.even, packedEven, packed_gauge_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaGauge.odd, packedOdd, packed_gauge_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaHGauge.even, packedEven, packed_gauge_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(cudaHGauge.odd, packedOdd, packed_gauge_bytes, cudaMemcpyHostToDevice);
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
......@@ -744,10 +769,14 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
}
if (cuda_prec == QUDA_HALF_PRECISION) {
if (!spinorHalf && !spinorNorm) {
if (!hSpinor1.spinorHalf && !hSpinor1.spinorNorm &&
!hSpinor2.spinorHalf && !hSpinor2.spinorNorm ) {
allocateSpinorHalf();
} else if (!spinorHalf || !spinorNorm) {
printf("allocateSpinorHalf error %u %u\n", spinorHalf, spinorNorm);
} else if (!hSpinor1.spinorHalf || !hSpinor1.spinorNorm ||
!hSpinor2.spinorHalf || !hSpinor2.spinorNorm) {
printf("allocateSpinorHalf error %u %u %u %u\n",
hSpinor1.spinorHalf, hSpinor1.spinorNorm,
hSpinor2.spinorHalf, hSpinor2.spinorNorm);
exit(-1);
}
}
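A hedged sketch of the quantization that packHalf8Single relies on: the stored values (angles divided by pi, and SU(3) entries of magnitude at most one) all lie in [-1,1], so they map linearly onto the short range. The scale factor below is an assumption; the actual floatToShort is defined elsewhere in the tree:

static inline short floatToShortSketch(float a) {   /* assumes |a| <= 1 */
    return (short)(a * 32767.0f);
}
static inline float shortToFloatSketch(short s) {
    return (float)s / 32767.0f;
}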
......
......@@ -7,8 +7,9 @@
#include <util_quda.h>
#include <field_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *invert_param, DagType dag_type)
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gaugeSloppy,
FullGauge gaugePrecise, ParitySpinor tmp,
QudaInvertParam *invert_param, DagType dag_type)
{
int len = Nh*spinorSiteSize;
......@@ -48,9 +49,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
QudaSumFloat r0Norm = rNorm;
QudaSumFloat maxrx = rNorm;
QudaSumFloat maxrr = rNorm;
QudaSumFloat delta = 1e-2;
QudaSumFloat delta = 5e-1;
int k=0;
int xUpdate = 0, rUpdate = 0;
//printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<invert_param->maxiter) {
......@@ -69,11 +72,13 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
}
if (dag_type == QUDA_DAG_NO)
rv = MatPCcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
//rv = MatPCcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
else
rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
//rv = MatPCDagcDotWXCuda(v, gauge, p, invert_param->kappa, tmp, b, invert_param->matpc_type);
MatPCDagCuda(v, gaugeSloppy, p, invert_param->kappa, tmp, invert_param->matpc_type);
//rv = cDotProductCuda((float2*)b, (float2*)v, len/2);
rv = cDotProductCuda((float2*)b, (float2*)v, len/2);
cuComplex rv32 = make_cuFloatComplex((float)rv.x, (float)rv.y);
alpha = cuCdivf(rho, rv32);
......@@ -83,9 +88,9 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
alpha.x *= -1.0f; alpha.y *= -1.0f;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gauge, r, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t, gauge, r, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCDagCuda(t, gaugeSloppy, r, invert_param->kappa, tmp, invert_param->matpc_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda((float2*)t, (float2*)r, len/2); // 6
......@@ -104,15 +109,15 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
if (updateR) {
QudaPrecision prec = invert_param ->cuda_prec;
QudaPrecision spinorPrec = invert_param ->cuda_prec;
invert_param -> cuda_prec = QUDA_SINGLE_PRECISION;
if (dag_type == QUDA_DAG_NO)
MatPCCuda(t, gauge, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
else
MatPCDagCuda(t, gauge, x, invert_param->kappa, tmp, invert_param->matpc_type);
MatPCDagCuda(t, gaugePrecise, x, invert_param->kappa, tmp, invert_param->matpc_type);
invert_param -> cuda_prec = prec;
invert_param -> cuda_prec = spinorPrec;
copyCuda((float*)r, (float*)b, len);
mxpyCuda((float*)t, (float*)r, len);
......@@ -120,6 +125,8 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
rNorm = sqrt(r2);
maxrr = rNorm;
rUpdate++;
if (updateX) {
axpyCuda(1.0f, (float*)x, (float*)y, len);
zeroCuda((float*)x, len);
......@@ -127,13 +134,14 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
r0Norm = rNorm;
maxrx = rNorm;
xUpdate++;
}
}
k++;
//printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normCuda((float*)x, len));
printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normCuda((float*)x, len));
}
axpyCuda(1.0f, (float*)y, (float*)x, len);
......@@ -141,7 +149,11 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor source, FullGauge gauge,
//if (k==maxiters) printf("Exceeded maximum iterations %d\n", maxiters);
printf("Residual updates = %d, Solution updates = %d\n", rUpdate, xUpdate);
float gflops = (1.0e-9*Nh)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
gflops += 1.0e-9*Nh*rUpdate*((2*1320+48) + 3*spinorSiteSize);
gflops += 1.0e-9*Nh*xUpdate*spinorSiteSize;
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
invert_param->gflops += gflops;
invert_param->iter += k;
......
......@@ -8,6 +8,9 @@
#include <util_quda.h>
#include <field_quda.h>
FullGauge gaugeSloppy; // sloppy gauge field
FullGauge gaugePrecise; // precise gauge field
void printGaugeParam(QudaGaugeParam *param) {
printf("Gauge Params:\n");
......@@ -76,12 +79,24 @@ void initQuda(int dev)
cudaGauge.even = 0;
cudaGauge.odd = 0;
cudaHGauge.even = 0;
cudaHGauge.odd = 0;
hSpinor1.spinorHalf = 0;
hSpinor1.spinorNorm = 0;
hSpinor2.spinorHalf = 0;
hSpinor2.spinorNorm = 0;
}
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
{
gauge_param = param;
loadGaugeField(h_gauge);
gaugePrecise = cudaGauge;
gaugeSloppy = (param->cuda_prec == QUDA_HALF_PRECISION) ? cudaHGauge : cudaGauge;
}
void endQuda()
......@@ -90,6 +105,96 @@ void endQuda()
freeGaugeField();
}
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger)
{
ParitySpinor in = allocateParitySpinor();
ParitySpinor out = allocateParitySpinor();
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
dslashCuda(out, cudaGauge, in, parity, dagger);
} else if (inv_param->cuda_prec == QUDA_HALF_PRECISION) {
dslashCuda(out, cudaHGauge, in, parity, dagger);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(out);
freeParitySpinor(in);
}
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor();
ParitySpinor out = allocateParitySpinor();
ParitySpinor tmp = allocateParitySpinor();
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCCuda(out, cudaGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
freeParitySpinor(in);
}
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor();
ParitySpinor out = allocateParitySpinor();
ParitySpinor tmp = allocateParitySpinor();
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCDagCuda(out, cudaGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCDagCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
freeParitySpinor(in);
}
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
{
ParitySpinor in = allocateParitySpinor();
ParitySpinor out = allocateParitySpinor();
ParitySpinor tmp = allocateParitySpinor();
loadParitySpinor(in, h_in, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
MatPCDagMatPCCuda(out, cudaGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
} else {
MatPCDagMatPCCuda(out, cudaHGauge, in, inv_param->kappa, tmp, inv_param->matpc_type);
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec,
inv_param->cuda_prec, inv_param->dirac_order);
freeParitySpinor(tmp);
freeParitySpinor(out);
freeParitySpinor(in);
}
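A hedged usage sketch of the new host-pointer entry points (h_in and h_out are host spinor buffers whose layout follows inv_param; the fields set here are the ones these wrappers read, per the diff):

void apply_dslash_example(float *h_in, float *h_out) {
    QudaInvertParam inv_param;
    inv_param.cpu_prec    = QUDA_SINGLE_PRECISION;
    inv_param.cuda_prec   = QUDA_HALF_PRECISION;
    inv_param.dirac_order = QUDA_DIRAC_ORDER;
    dslashQuda(h_out, h_in, &inv_param, 0 /* parity */, 0 /* no dagger */);
}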
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
{
invert_param = param;
......@@ -145,9 +250,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpayCuda(in, cudaGauge, b.odd, 0, 0, b.even, kappa);
dslashXpaySCuda(in, gaugePrecise, b.odd, 0, 0, b.even, kappa);
} else {
dslashXpayCuda(in, cudaGauge, b.even, 1, 0, b.odd, kappa);
dslashXpaySCuda(in, gaugePrecise, b.even, 1, 0, b.odd, kappa);
}
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
......@@ -166,16 +271,16 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
case QUDA_CG_INVERTER:
if (param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
copyCuda((float *)out, (float *)in, slenh);
MatPCDagCuda(in, cudaGauge, out, kappa, tmp, param->matpc_type);
MatPCDagCuda(in, gaugePrecise, out, kappa, tmp, param->matpc_type);
}
invertCgCuda(out, in, cudaGauge, tmp, param);
invertCgCuda(out, in, gaugePrecise, tmp, param);
break;
case QUDA_BICGSTAB_INVERTER:
if (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
invertBiCGstabCuda(out, in, cudaGauge, tmp, param, QUDA_DAG_YES);
invertBiCGstabCuda(out, in, gaugeSloppy, gaugePrecise, tmp, param, QUDA_DAG_YES);
copyCuda((float *)in, (float *)out, slenh);
}
invertBiCGstabCuda(out, in, cudaGauge, tmp, param, QUDA_DAG_NO);
invertBiCGstabCuda(out, in, gaugeSloppy, gaugePrecise, tmp, param, QUDA_DAG_NO);
break;
default:
printf("Inverter type %d not implemented\n", param->inv_type);
......@@ -191,9 +296,9 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
}
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashXpayCuda(x.odd, cudaGauge, out, 1, 0, b.odd, kappa);
dslashXpaySCuda(x.odd, gaugePrecise, out, 1, 0, b.odd, kappa);
} else {
dslashXpayCuda(x.even, cudaGauge, out, 0, 0, b.even, kappa);
dslashXpaySCuda(x.even, gaugePrecise, out, 0, 0, b.even, kappa);
}
retrieveSpinorField(h_x, x, param->cpu_prec, param->cuda_prec, param->dirac_order);
......
......@@ -60,6 +60,12 @@ extern "C" {
void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity, int dagger);
void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatPCDagQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param);
void endQuda(void);
void printGaugeParam(QudaGaugeParam *);
......
......@@ -7,7 +7,7 @@
int main(int argc, char **argv)
{
int device = 1;
int device = 0;
float *gauge[4];
......@@ -15,7 +15,7 @@ int main(int argc, char **argv)
QudaInvertParam inv_param;
Gauge_param.cpu_prec = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
Gauge_param.cuda_prec = QUDA_HALF_PRECISION;
Gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
Gauge_param.X = L1;
Gauge_param.Y = L2;
......@@ -30,13 +30,13 @@ int main(int argc, char **argv)
Gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
gauge_param = &Gauge_param;
float mass = -0.95;
float mass = -0.96;
inv_param.kappa = 1.0 / (2.0*(4 + mass));
inv_param.tol = 1e-7;
inv_param.tol = 5e-7;
inv_param.maxiter = 5000;
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec = QUDA_HALF_PRECISION;
inv_param.solution_type = QUDA_MAT_SOLUTION;
inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
......
......@@ -49,6 +49,30 @@
I4.x *= C; I4.y *= C; I4.z *= C; I4.w *= C; \
I5.x *= C; I5.y *= C; I5.z *= C; I5.w *= C;
#define READ_ACCUM_SINGLE(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Nh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Nh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Nh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Nh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Nh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Nh);
#define READ_ACCUM_HALF(spinor) \
float4 accum0 = tex1Dfetch((spinor), sid + 0*Nh); \
float4 accum1 = tex1Dfetch((spinor), sid + 1*Nh); \
float4 accum2 = tex1Dfetch((spinor), sid + 2*Nh); \
float4 accum3 = tex1Dfetch((spinor), sid + 3*Nh); \
float4 accum4 = tex1Dfetch((spinor), sid + 4*Nh); \
float4 accum5 = tex1Dfetch((spinor), sid + 5*Nh); \
float C = tex1Dfetch((accumTexNorm), sid); \
accum0.x *= C; accum0.y *= C; accum0.z *= C; accum0.w *= C; \
accum1.x *= C; accum1.y *= C; accum1.z *= C; accum1.w *= C; \
accum2.x *= C; accum2.y *= C; accum2.z *= C; accum2.w *= C; \
accum3.x *= C; accum3.y *= C; accum3.z *= C; accum3.w *= C; \
accum4.x *= C; accum4.y *= C; accum4.z *= C; accum4.w *= C; \
accum5.x *= C; accum5.y *= C; accum5.z *= C; accum5.w *= C;
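The READ_ACCUM_HALF variant reflects the half-precision spinor layout introduced in this commit: 24 quantized components per site plus one float norm, so unpacking is a fetch followed by a single scale. A hedged sketch of that layout (the quantization scale is an assumption):

__device__ void unpack_half_site(const short4 *spinorHalf,
                                 const float *spinorNorm,
                                 int sid, int Nh, float out[24]) {
    float C = spinorNorm[sid];               /* per-site normalization   */
    for (int j = 0; j < 6; j++) {            /* 6 short4 = 24 components */
        short4 v = spinorHalf[sid + j*Nh];
        out[4*j+0] = C * (v.x / 32767.0f);
        out[4*j+1] = C * (v.y / 32767.0f);
        out[4*j+2] = C * (v.z / 32767.0f);
        out[4*j+3] = C * (v.w / 32767.0f);
    }
}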
#define WRITE_SPINOR_FLOAT4() \
g_out[0*Nh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im); \
g_out[1*Nh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im); \
......
......@@ -28,6 +28,8 @@
#define DagType QudaDagType
#define Tboundary QudaTboundary
#include <enum_quda.h>
#ifdef __cplusplus
extern "C" {
#endif
......@@ -41,10 +43,17 @@ extern "C" {
} FullSpinor;
typedef struct {
Precision precision;
ReconstructType reconstruct;
ParityGauge odd;
ParityGauge even;
} FullGauge;
typedef struct ParityHSpinor_S {
short4 *spinorHalf;
float *spinorNorm;
} ParityHSpinor;
#ifdef __cplusplus
}
#endif
......
......@@ -34,7 +34,9 @@
float4 G1 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+1)*Nh); \
float4 G2 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+2)*Nh); \
float4 G3 = make_float4(0,0,0,0); \
float4 G4 = make_float4(0,0,0,0); \
float4 G4 = make_float4(0,0,0,0);
#define RECONSTRUCT_MATRIX_12(dir) \
ACC_CONJ_PROD(g20, +g01, +g12); \
ACC_CONJ_PROD(g20, -g02, +g11); \
ACC_CONJ_PROD(g21, +g02, +g10); \
......@@ -51,60 +53,25 @@
float4 G2 = make_float4(0,0,0,0); \
float4 G3 = make_float4(0,0,0,0); \
float4 G4 = make_float4(0,0,0,0); \
float row_sum = g01_re*g01_re + g01_im*g01_im; \
row_sum += g02_re*g02_re + g02_im*g02_im; \
float u0 = (dir < 6 ? anisotropy : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary : 1)); \
float u02_inv = __fdividef(1.f, u0*u0); \
float column_sum = u02_inv - row_sum; \
float U00_mag = sqrtf(column_sum); \
g21_re = g00_re; \
g21_im = g00_im; \
__sincosf(g21_re, &g00_im, &g00_re); \
g00_re *= U00_mag; \
g00_im *= U00_mag; \
column_sum += g10_re*g10_re; \
column_sum += g10_im*g10_im; \
__sincosf(g21_im, &g20_im, &g20_re); \
float U20_mag = sqrtf(u02_inv - column_sum); \
g20_re *= U20_mag; \
g20_im *= U20_mag; \
float r_inv2 = __fdividef(1.0,u0*row_sum); \
COMPLEX_DOT_PRODUCT(A, g00, g10); \
A_re *= u0; A_im *= u0; \
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
ACC_COMPLEX_PROD(g11, A, g01); \
g11_re *= -r_inv2; \
g11_im *= -r_inv2; \
COMPLEX_CONJUGATE_PRODUCT(g12, g20, g01); \
ACC_COMPLEX_PROD(g12, -A, g02); \
g12_re *= r_inv2; \
g12_im *= r_inv2; \
COMPLEX_DOT_PRODUCT(A, g00, g20); \
A_re *= u0; A_im *= u0; \
COMPLEX_CONJUGATE_PRODUCT(g21, g10, g02); \
ACC_COMPLEX_PROD(g21, -A, g01); \
g21_re *= r_inv2; \
g21_im *= r_inv2; \
COMPLEX_CONJUGATE_PRODUCT(g22, g10, g01); \
ACC_COMPLEX_PROD(g22, A, g02); \
g22_re *= -r_inv2; \
g22_im *= -r_inv2;
g21_im = g00_im;
// set A to be last components of G4 (otherwise unused)
#define READ_GAUGE_MATRIX_8_HALF(gauge, dir) \
float4 G0 = tex1Dfetch((gauge), ga_idx + ((dir/2)*2+0)*Nh); \
float4 G1 = tex1Dfetch((gauge), ga_idx + ((dir/2)*2+1)*Nh); \
float4 G2 = make_float4(0,0,0,0); \
float4 G3 = make_float4(0,0,0,0); \
float4 G4 = make_float4(0,0,0,0); \
g21_re = pi*g00_re; \
g21_im = pi*g00_im;
#define RECONSTRUCT_MATRIX_8(dir) \
float row_sum = g01_re*g01_re + g01_im*g01_im; \
row_sum += g02_re*g02_re + g02_im*g02_im; \
float u0 = (dir < 6 ? anisotropy : (ga_idx >= (L4-1)*L1h*L2*L3 ? t_boundary : 1)); \
float u02_inv = __fdividef(1.f, u0*u0); \
float column_sum = u02_inv - row_sum; \
float U00_mag = sqrtf(column_sum); \
g21_re = M_PI*g00_re; \
g21_im = M_PI*g00_im; \
float u02_inv = __fdividef(1.f, u0*u0); \
float column_sum = u02_inv - row_sum; \
float U00_mag = sqrtf(column_sum); \
__sincosf(g21_re, &g00_im, &g00_re); \
g00_re *= U00_mag; \
g00_im *= U00_mag; \
......@@ -114,7 +81,7 @@
float U20_mag = sqrtf(u02_inv - column_sum); \
g20_re *= U20_mag; \
g20_im *= U20_mag; \
float r_inv2 = __fdividef(1.0,u0*row_sum); \
float r_inv2 = __fdividef(1.0f, u0*row_sum); \
COMPLEX_DOT_PRODUCT(A, g00, g10); \
A_re *= u0; A_im *= u0; \
COMPLEX_CONJUGATE_PRODUCT(g11, g20, g02); \
......
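For orientation, the 8-parameter reconstruction above stores, as read from the macros, the phases of g00 and g20 (scaled by pi in the half-precision path) together with the complex entries g01, g02 and g10; the missing magnitudes are recovered from unitarity. The first step, as a hedged standalone device function mirroring the macro:

__device__ float reconstruct_u00_mag(float2 g01, float2 g02, float u0) {
    /* |g00|^2 = 1/u0^2 - |g01|^2 - |g02|^2 from the scaled unit-norm row */
    float row_sum = g01.x*g01.x + g01.y*g01.y
                  + g02.x*g02.x + g02.y*g02.y;
    float u02_inv = __fdividef(1.f, u0*u0);
    return sqrtf(u02_inv - row_sum);
}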
......@@ -74,7 +74,7 @@ QudaSumComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
QudaSumComplex gpu_result;
cuDoubleComplex gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
for (int i = 0; i < blocks; i++) {
......@@ -83,7 +83,11 @@ QudaSumComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
}
cudaFree(d_odata);
return gpu_result;
QudaSumComplex res;
res.x = (QudaSumFloat)gpu_result.x;
res.y = (QudaSumFloat)gpu_result.y;
return res;
}
......@@ -153,7 +157,8 @@ QudaSumComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
QudaSumComplex gpu_result;
cuDoubleComplex gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
for (int i = 0; i < blocks; i++) {
......@@ -162,7 +167,11 @@ QudaSumComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
}
cudaFree(d_odata);
return gpu_result;
QudaSumComplex res;
res.x = (QudaSumFloat)gpu_result.x;
res.y = (QudaSumFloat)gpu_result.y;
return res;
}
#endif // REDUCE_DOUBLE_PRECISION
......
......@@ -65,12 +65,12 @@ QudaSumFloat REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
QudaSumFloat gpu_result = 0;
double gpu_result = 0;
for (int i = 0; i < blocks; i++)
gpu_result += h_odata[i];
cudaFree(d_odata);
return gpu_result;
return (QudaSumFloat)gpu_result;
}
......@@ -134,12 +134,12 @@ QudaSumFloat REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
QudaSumFloat gpu_result = 0;
double gpu_result = 0;
for (int i = 0; i < blocks; i++)
gpu_result += h_odata[i];
cudaFree(d_odata);
return gpu_result;
return (QudaSumFloat)gpu_result;
}
#endif // REDUCE_DOUBLE_PRECISION
......@@ -81,7 +81,7 @@ QudaSumFloat3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
QudaSumFloat3 gpu_result;
double3 gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
gpu_result.z = 0;
......@@ -92,7 +92,12 @@ QudaSumFloat3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
}
cudaFree(d_odata);
return gpu_result;
QudaSumFloat3 res;
res.x = (QudaSumFloat)gpu_result.x;
res.y = (QudaSumFloat)gpu_result.y;
res.z = (QudaSumFloat)gpu_result.z;
return res;
}
......@@ -168,7 +173,8 @@ QudaSumFloat3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
// copy result from device to host, and perform final reduction on CPU
cudaMemcpy(h_odata, d_odata, blocks*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
QudaSumFloat3 gpu_result;
double3 gpu_result;
gpu_result.x = 0;
gpu_result.y = 0;
gpu_result.z = 0;
......@@ -179,7 +185,12 @@ QudaSumFloat3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n) {
}
cudaFree(d_odata);
return gpu_result;
QudaSumFloat3 res;
res.x = (QudaSumFloat)gpu_result.x;
res.y = (QudaSumFloat)gpu_result.y;
res.z = (QudaSumFloat)gpu_result.z;
return res;
}
#endif // REDUCE_DOUBLE_PRECISION
......
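The reduce-core changes share one pattern: the final host-side pass over the per-block partial sums now accumulates in double (or cuDoubleComplex / double3) and narrows to QudaSumFloat only on return, so float round-off no longer grows with the block count. A hedged distillation of the scalar case:

static float final_reduce(const float *h_odata, int blocks) {
    double acc = 0.0;                 /* accumulate in double            */
    for (int i = 0; i < blocks; i++)
        acc += h_odata[i];            /* h_odata: per-block partial sums */
    return (float)acc;                /* narrow once, at the end         */
}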