Advanced Computing Platform for Theoretical Physics

Commit ca9d56f0 authored by mikeaclark's avatar mikeaclark
Browse files

Initial pass at production code

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@284 be54200a-260c-0410-bdd7-ce6af2a381ab
parents
CUFILES := dslash_cuda.cu blas_cuda.cu
CCFILES := inv_bicgstab_cuda.cpp inv_cg.cpp util_cuda.cpp field_cuda.cpp
CUDA_INSTALL_PATH = /usr/local/cuda
INCLUDES = -I. -I$(CUDA_INSTALL_PATH)/include
LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart -lcublas
CC = gcc
CFLAGS = -Wall -O3 -std=c99 $(INCLUDES)
CXX = g++
CXXFLAGS = -Wall $(INCLUDES) -DUNIX -O3
NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
NVCCFLAGS = $(INCLUDES) -DUNIX -O3
LDFLAGS = -fPIC $(LIB)
CCOBJECTS = $(CCFILES:.cpp=.o)
CUOBJECTS = $(CUFILES:.cu=.o)
all: dslash_test invert_test su3_test
ILIB = libquda.a
ILIB_OBJS = inv_bicgstab_quda.o inv_cg_quda.o dslash_quda.o blas_quda.o util_quda.o \
dslash_reference.o blas_reference.o invert_quda.o field_quda.o
ILIB_DEPS = $(ILIB_OBJS) blas_quda.h quda.h util_quda.h invert_quda.h field_quda.h enum_quda.h
$(ILIB): $(ILIB_DEPS)
ar cru $@ $(ILIB_OBJS)
invert_test: invert_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
dslash_test: dslash_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
su3_test: su3_test.o $(ILIB)
$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
clean:
-rm -f *.o dslash_test invert_test su3_test $(ILIB)
%.o: %.c
$(CC) $(CFLAGS) $< -c -o $@
%.o: %.cpp
$(CXX) $(CXXFLAGS) $< -c -o $@
%.o: %.cu
$(NVCC) $(NVCCFLAGS) $< -c -o $@
#include <stdlib.h>
#include <stdio.h>
#include <quda.h>
#include <util_quda.h>
#define REDUCE_DOUBLE_PRECISION
#define REDUCE_THREADS 128
#define REDUCE_MAX_BLOCKS 64
void zeroCuda(float* dst, int len) {
// cuda's floating point format, IEEE-754, represents the floating point
// zero as 4 zero bytes
cudaMemset(dst, 0, len*sizeof(float));
}
void copyCuda(float* dst, float *src, int len) {
cudaMemcpy(dst, src, len*sizeof(float), cudaMemcpyDeviceToDevice);
}
__global__ void axpbyKernel(float a, float *x, float b, float *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
y[i] = a*x[i] + b*y[i];
i += gridSize;
}
}
// performs the operation y[i] = a*x[i] + b*y[i]
void axpbyCuda(float a, float *x, float b, float *y, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
axpbyKernel<<<dimGrid, dimBlock>>>(a, x, b, y, len);
}
// performs the operation y[i] = a*x[i] + y[i]
void axpyCuda(float a, float *x, float *y, int len) {
axpbyCuda(a, x, 1.0, y, len);
}
// performs the operation y[i] = x[i] + a*y[i]
void xpayCuda(float *x, float a, float *y, int len) {
axpbyCuda(1.0, x, a, y, len);
}
// performs the operation y[i] -= x[i] (minus x plus y)
void mxpyCuda(float *x, float *y, int len) {
axpbyCuda(-1.0, x, 1.0, y, len);
}
// performs the operation x[i] = a*x[i]
void axCuda(float a, float *x, int len) {
axpbyCuda(0.0, x, a, x, len);
}
__global__ void caxpyKernel(float2 a, float2 *x, float2 *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
float Xx = x[i].x;
float Xy = x[i].y;
y[i].x += a.x*Xx - a.y*Xy;
y[i].y += a.y*Xx + a.x*Xy;
i += gridSize;
}
}
// performs the operation y[i] += a*x[i]
void caxpyCuda(float2 a, float2 *x, float2 *y, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
caxpyKernel<<<dimGrid, dimBlock>>>(a, x, y, len);
}
__global__ void caxpbyKernel(float2 a, float2 *x, float2 b, float2 *y, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
float Xx = x[i].x;
float Xy = x[i].y;
float Yx = y[i].x;
float Yy = y[i].y;
y[i].x = a.x*Xx + b.x*Yx - a.y*Xy - b.y*Yy;
y[i].y = a.y*Xx + b.y*Yx + a.x*Xy + b.x*Yy;
i += gridSize;
}
}
// performs the operation y[i] = c*x[i] + b*y[i]
void caxbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
caxpbyKernel<<<dimGrid, dimBlock>>>(a, x, b, y, len);
}
__global__ void cxpaypbzKernel(float2 *x, float2 a, float2 *y,
float2 b, float2 *z, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
float Xx = x[i].x;
float Xy = x[i].y;
float Yx = y[i].x;
float Yy = y[i].y;
float Zx = z[i].x;
float Zy = z[i].y;
Xx += a.x*Yx - a.y*Yy;
Xy += a.y*Yx + a.x*Yy;
Xx += b.x*Zx - b.y*Zy;
Xy += b.y*Zx + b.x*Zy;
z[i].x = Xx;
z[i].y = Xy;
i += gridSize;
}
}
// performs the operation z[i] = x[i] + a*y[i] + b*z[i]
void cxpaypbzCuda(float2 *x, float2 a, float2 *y, float2 b, float2 *z, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
cxpaypbzKernel<<<dimGrid, dimBlock>>>(x, a, y, b, z, len);
}
__global__ void axpyZpbxKernel(float a, float *x, float *y, float *z, float b, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
float x_i = x[i];
y[i] = a*x_i + y[i];
x[i] = z[i] + b*x_i;
i += gridSize;
}
}
// performs the operations: {y[i] = a x[i] + y[i]; x[i] = z[i] + b x[i]}
void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
axpyZpbxKernel<<<dimGrid, dimBlock>>>(a, x, y, z, b, len);
}
__global__ void caxpbypzYmbwKernel(float2 a, float2 *x, float2 b, float2 *y,
float2 *z, float2 *w, int len) {
unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
unsigned int gridSize = gridDim.x*blockDim.x;
while (i < len) {
float Xx = x[i].x;
float Xy = x[i].y;
float Yx = y[i].x;
float Yy = y[i].y;
float Wx = w[i].x;
float Wy = w[i].y;
float Zx = a.x*Xx - a.y*Xy;
float Zy = a.y*Xx + a.x*Xy;
Zx += b.x*Yx - b.y*Yy;
Zy += b.y*Yx + b.x*Yy;
Yx -= b.x*Wx - b.y*Wy;
Yy -= b.y*Wx + b.x*Wy;
z[i].x += Zx;
z[i].y += Zy;
y[i].x = Yx;
y[i].y = Yy;
i += gridSize;
}
}
// performs the operation z[i] = a*x[i] + b*y[i] + z[i] and y[i] -= b*w[i]
void caxpbypzYmbwCuda(float2 a, float2 *x, float2 b, float2 *y,
float2 *z, float2 *w, int len) {
int blocks = min(REDUCE_MAX_BLOCKS, max(len/REDUCE_THREADS, 1));
dim3 dimBlock(REDUCE_THREADS, 1, 1);
dim3 dimGrid(blocks, 1, 1);
caxpbypzYmbwKernel<<<dimGrid, dimBlock>>>(a, x, b, y, z, w, len);
}
// performs the operation y[i] = a*x[i] + y[i], and returns norm(y)
// float axpyNormCuda(float a, float *x, float *y, int len);
// Computes c = a + b in "double single" precision.
__device__ void dsadd(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1) {
// Compute dsa + dsb using Knuth's trick.
float t1 = a0 + b0;
float e = t1 - a0;
float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;
// The result is t1 + t2, after normalization.
c0 = e = t1 + t2;
c1 = t2 - (e - t1);
}
// Computes c = a + b in "double single" precision (complex version)
__device__ void zcadd(float2 &c0, float2 &c1, const float2 a0, const float2 a1, const float2 b0, const float2 b1) {
// Compute dsa + dsb using Knuth's trick.
float t1 = a0.x + b0.x;
float e = t1 - a0.x;
float t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
// The result is t1 + t2, after normalization.
c0.x = e = t1 + t2;
c1.x = t2 - (e - t1);
// Compute dsa + dsb using Knuth's trick.
t1 = a0.y + b0.y;
e = t1 - a0.y;
t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
// The result is t1 + t2, after normalization.
c0.y = e = t1 + t2;
c1.y = t2 - (e - t1);
}
// Computes c = a + b in "double single" precision (float3 version)
__device__ void dsadd3(float3 &c0, float3 &c1, const float3 a0, const float3 a1, const float3 b0, const float3 b1) {
// Compute dsa + dsb using Knuth's trick.
float t1 = a0.x + b0.x;
float e = t1 - a0.x;
float t2 = ((b0.x - e) + (a0.x - (t1 - e))) + a1.x + b1.x;
// The result is t1 + t2, after normalization.
c0.x = e = t1 + t2;
c1.x = t2 - (e - t1);
// Compute dsa + dsb using Knuth's trick.
t1 = a0.y + b0.y;
e = t1 - a0.y;
t2 = ((b0.y - e) + (a0.y - (t1 - e))) + a1.y + b1.y;
// The result is t1 + t2, after normalization.
c0.y = e = t1 + t2;
c1.y = t2 - (e - t1);
// Compute dsa + dsb using Knuth's trick.
t1 = a0.z + b0.z;
e = t1 - a0.z;
t2 = ((b0.z - e) + (a0.z - (t1 - e))) + a1.z + b1.z;
// The result is t1 + t2, after normalization.
c0.z = e = t1 + t2;
c1.z = t2 - (e - t1);
}
//
// float sumCuda(float *a, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) sum##suffix
#define REDUCE_TYPES float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) a[i]
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
//
// float normCuda(float *a, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) norm##suffix
#define REDUCE_TYPES float *a
#define REDUCE_PARAMS a
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*a[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
//
// float reDotProductCuda(float *a, float *b, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) reDotProduct##suffix
#define REDUCE_TYPES float *a, float *b
#define REDUCE_PARAMS a, b
#define REDUCE_AUXILIARY(i)
#define REDUCE_OPERATION(i) (a[i]*b[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
//
// float2 cDotProductCuda(float2 *a, float2 *b, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) cDotProduct##suffix
#define REDUCE_TYPES float2 *a, float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_REAL_AUXILIARY(i)
#define REDUCE_IMAG_AUXILIARY(i)
#define REDUCE_REAL_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_IMAG_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
//
// float3 cDotProductNormACuda(float2 *a, float2 *b, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) cDotProductNormA##suffix
#define REDUCE_TYPES float2 *a, float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_X_AUXILIARY(i)
#define REDUCE_Y_AUXILIARY(i)
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#define REDUCE_Z_OPERATION(i) (a[i].x*a[i].x + a[i].y*a[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
//
// float3 cDotProductNormBCuda(float2 *a, float2 *b, int n) {}
//
#define REDUCE_FUNC_NAME(suffix) cDotProductNormB##suffix
#define REDUCE_TYPES float2 *a, float2 *b
#define REDUCE_PARAMS a, b
#define REDUCE_X_AUXILIARY(i)
#define REDUCE_Y_AUXILIARY(i)
#define REDUCE_Z_AUXILIARY(i)
#define REDUCE_X_OPERATION(i) (a[i].x*b[i].x + a[i].y*b[i].y)
#define REDUCE_Y_OPERATION(i) (a[i].x*b[i].y - a[i].y*b[i].x)
#define REDUCE_Z_OPERATION(i) (b[i].x*b[i].x + b[i].y*b[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
//
// float3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
// float2 *z, float2 *w, float2 *u, int len)
//
#define REDUCE_FUNC_NAME(suffix) caxpbypzYmbwcDotProductWYNormY##suffix
#define REDUCE_TYPES float2 a, float2 *x, float2 b, float2 *y, float2 *z, float2 *w, float2 *u
#define REDUCE_PARAMS a, x, b, y, z, w, u
#define REDUCE_X_AUXILIARY(i) \
float Xx = x[i].x; \
float Xy = x[i].y; \
float Yx = y[i].x; \
float Yy = y[i].y; \
float Wx = w[i].x; \
float Wy = w[i].y;
#define REDUCE_Y_AUXILIARY(i) \
float Zx = a.x*Xx - a.y*Xy; \
float Zy = a.y*Xx + a.x*Xy; \
Zx += b.x*Yx - b.y*Yy; \
Zy += b.y*Yx + b.x*Yy; \
Yx -= b.x*Wx - b.y*Wy; \
Yy -= b.y*Wx + b.x*Wy;
#define REDUCE_Z_AUXILIARY(i) \
z[i].x += Zx; \
z[i].y += Zy; \
y[i].x = Yx; \
y[i].y = Yy;
#define REDUCE_X_OPERATION(i) (u[i].x*y[i].x + u[i].y*y[i].y)
#define REDUCE_Y_OPERATION(i) (u[i].x*y[i].y - u[i].y*y[i].x)
#define REDUCE_Z_OPERATION(i) (y[i].x*y[i].x + y[i].y*y[i].y)
#include "reduce_triple_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_X_AUXILIARY
#undef REDUCE_Y_AUXILIARY
#undef REDUCE_Z_AUXILIARY
#undef REDUCE_X_OPERATION
#undef REDUCE_Y_OPERATION
#undef REDUCE_Z_OPERATION
//
// float axpyNormCuda(float a, float *x, float *y, n){}
//
// First performs the operation y[i] = a*x[i] + y[i]
// Second returns the norm of y
//
#define REDUCE_FUNC_NAME(suffix) axpyNorm##suffix
#define REDUCE_TYPES float a, float *x, float *y
#define REDUCE_PARAMS a, x, y
#define REDUCE_AUXILIARY(i) y[i] = a*x[i] + y[i]
#define REDUCE_OPERATION(i) (y[i]*y[i])
#include "reduce_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_AUXILIARY
#undef REDUCE_OPERATION
//
// float2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int n) {}
//
// First performs the operation y = x - a*y
// Second returns complex dot product (z,y)
//
#define REDUCE_FUNC_NAME(suffix) xpaycDotzy##suffix
#define REDUCE_TYPES float2 *x, float a, float2 *y, float2 *z
#define REDUCE_PARAMS x, a, y, z
#define REDUCE_REAL_AUXILIARY(i) y[i].x = x[i].x + a*y[i].x
#define REDUCE_IMAG_AUXILIARY(i) y[i].y = x[i].y + a*y[i].y
#define REDUCE_REAL_OPERATION(i) (z[i].x*y[i].x + z[i].y*y[i].y)
#define REDUCE_IMAG_OPERATION(i) (z[i].x*y[i].y - z[i].y*y[i].x)
#include "reduce_complex_core.h"
#undef REDUCE_FUNC_NAME
#undef REDUCE_TYPES
#undef REDUCE_PARAMS
#undef REDUCE_REAL_AUXILIARY
#undef REDUCE_IMAG_AUXILIARY
#undef REDUCE_REAL_OPERATION
#undef REDUCE_IMAG_OPERATION
double cpuDouble(float *data, int size) {
double sum = 0;
for (int i = 0; i < size; i++)
sum += data[i];
return sum;
}
void blasTest() {
int n = 3*1<<24;
float *h_data = (float *)malloc(n*sizeof(float));
float *d_data;
if (cudaMalloc((void **)&d_data, n*sizeof(float))) {
printf("Error allocating d_data\n");
exit(0);
}
for (int i = 0; i < n; i++) {
h_data[i] = rand()/(float)RAND_MAX - 0.5; // n-1.0-i;
}
cudaMemcpy(d_data, h_data, n*sizeof(float), cudaMemcpyHostToDevice);
cudaThreadSynchronize();
stopwatchStart();
int LOOPS = 20;
for (int i = 0; i < LOOPS; i++) {
sumCuda(d_data, n);
}
cudaThreadSynchronize();
float secs = stopwatchReadSeconds();
printf("%f GiB/s\n", 1.e-9*n*sizeof(float)*LOOPS / secs);
printf("Device: %f MiB\n", (float)n*sizeof(float) / (1 << 20));
printf("Shared: %f KiB\n", (float)REDUCE_THREADS*sizeof(float) / (1 << 10));
float correctDouble = cpuDouble(h_data, n);
printf("Total: %f\n", correctDouble);
printf("CUDA db: %f\n", fabs(correctDouble-sumCuda(d_data, n)));
cudaFree(d_data) ;
free(h_data);
}
void axpbyTest() {
int n = 3 * 1 << 20;
float *h_x = (float *)malloc(n*sizeof(float));
float *h_y = (float *)malloc(n*sizeof(float));
float *h_res = (float *)malloc(n*sizeof(float));
float *d_x, *d_y;
if (cudaMalloc((void **)&d_x, n*sizeof(float))) {
printf("Error allocating d_x\n");
exit(0);
}
if (cudaMalloc((void **)&d_y, n*sizeof(float))) {
printf("Error allocating d_y\n");
exit(0);
}
for (int i = 0; i < n; i++) {
h_x[i] = 1;
h_y[i] = 2;
}
cudaMemcpy(d_x, h_x, n*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, n*sizeof(float), cudaMemcpyHostToDevice);
axpbyCuda(4, d_x, 3, d_y, n/2);
cudaMemcpy( h_res, d_y, n*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++) {
float expect = (i < n/2) ? 4*h_x[i] + 3*h_y[i] : h_y[i];
if (h_res[i] != expect)
printf("FAILED %d : %f != %f\n", i, h_res[i], h_y[i]);
}
cudaFree(d_y);
cudaFree(d_x);
free(h_x);
free(h_y);
}
#ifndef _QUDA_BLAS_H
#define _QUDA_BLAS_H
#ifdef __cplusplus
extern "C" {
#endif
// ---------- blas_cuda.cu ----------
void zeroCuda(float* dst, int cnt);
void copyCuda(float* dst, float *src, int len);
void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpyCuda(float a, float *x, float *y, int len);
void axCuda(float a, float *x, int len);
void xpayCuda(float *x, float a, float *y, int len);
void mxpyCuda(float *x, float *y, int len);
void axpyZpbxCuda(float a, float *x, float *y, float *z, float b, int len);
float axpyNormCuda(float a, float *x, float *y, int len);
float sumCuda(float *a, int n);
float normCuda(float *a, int n);
float reDotProductCuda(float *a, float *b, int n);
void blasTest();
void axpbyTest();
void caxpbyCuda(float2 a, float2 *x, float2 b, float2 *y, int len);
void caxpyCuda(float2 a, float2 *x, float2 *y, int len);
void cxpaypbzCuda(float2 *x, float2 b, float2 *y, float2 c, float2 *z, int len);
float2 cDotProductCuda(float2*, float2*, int len);
void caxpbypzYmbwCuda(float2, float2*, float2, float2*, float2*, float2*, int len);
float3 cDotProductNormACuda(float2 *a, float2 *b, int n);
float3 cDotProductNormBCuda(float2 *a, float2 *b, int n);
float3 caxpbypzYmbwcDotProductWYNormYCuda(float2 a, float2 *x, float2 b, float2 *y,
float2 *z, float2 *w, float2 *u, int len);
float2 xpaycDotzyCuda(float2 *x, float a, float2 *y, float2 *z, int len);
// ---------- blas_reference.cpp ----------
void zero(float* a, int cnt);
void copy(float* a, float *b, int len);
void ax(float a, float *x, int len);
void axpbyCuda(float a, float *x, float b, float *y, int len);
void axpy(float a, float *x, float *y, int len);
void xpay(float *x, float a, float *y, int len);
void mxpy(float *x, float *y, int len);
float norm(float *vector, int len);
float reDotProduct(float *v1, float *v2, int len);
float imDotProduct(float *v1, float *v2, int len);
double normD(float *vector, int len);
double reDotProductD(float *v1, float *v2, int len);
double imDotProductD(float *v1, float *v2, int len);
#ifdef __cplusplus
}
#endif
#endif // _QUDA_BLAS_H
#include <quda.h>
// sets all elements of the destination vector to zero
void zero(float* a, int cnt) {
for (int i = 0; i < cnt; i++)
a[i] = 0;
}
// copy one spinor to the other
void copy(float* a, float *b, int len) {
for (int i = 0; i < len; i++) a[i] = b[i];
}
// performs the operation x[i] *= a
void ax(float a, float *x, int len) {
for (int i=0; i<len; i++) x[i] *= a;
}
// performs the operation y[i] = a*x[i] + b*y[i]
void axpby(float a, float *x, float b, float *y, int len) {
for (int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
}
// performs the operation y[i] = a*x[i] + y[i]
void axpy(float a, float *x, float *y, int len) {
for (int i=0; i<len; i++) y[i] += a*x[i];
}
// performs the operation y[i] = x[i] + a*y[i]
void xpay(float *x, float a, float *y, int len) {
for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];
}
// performs the operation y[i] -= x[i] (minus x plus y)
void mxpy(float *x, float *y, int len) {
for (int i=0; i<len; i++) y[i] -= x[i];
}
// returns the square of the L2 norm of the vector
float norm(float *v, int len) {
float sum=0.0;
for (int i=0; i<len; i++) {
sum += v[i]*v[i];
}
return sum;
}
// returns the real part of the dot product of 2 complex valued vectors
float reDotProduct(float *v1, float *v2, int len) {
float dot=0.0;
for (int i=0; i<len; i++) {
dot += v1[i]*v2[i];
}
return dot;
}
// returns the imaginary part of the dot product of 2 complex valued vectors
float imDotProduct(float *v1, float *v2, int len) {
float dot=0.0;
for (int i=0; i<len; i+=2) {
dot += v1[i]*v2[i+1] - v1[i+1]*v2[i];
}
return dot;
}
// returns the square of the L2 norm of the vector
double normD(float *v, int len) {
double sum=0.0;
for (int i=0; i<len; i++) {
sum += v[i]*v[i];
}
return sum;
}
// returns the real part of the dot product of 2 complex valued vectors
double reDotProductD(float *v1, float *v2, int len) {
double dot=0.0;
for (int i=0; i<len; i++) {
dot += v1[i]*v2[i];
}
return dot;
}
// returns the imaginary part of the dot product of 2 complex valued vectors
double imDotProductD(float *v1, float *v2, int len) {
double dot=0.0;
for (int i=0; i<len; i+=2) {
dot += v1[i]*v2[i+1] - v1[i+1]*v2[i];
}
return dot;
}
This diff is collapsed.
### complex numbers ########################################################################
def complexify(a):
return [complex(x) for x in a]
def complexToStr(c):
def fltToString(a):
if a == int(a): return `int(a)`
else: return `a`
def imToString(a):
if a == 0: return "0i"
elif a == -1: return "-i"
elif a == 1: return "i"
else: return fltToString(a)+"i"
re = c.real
im = c.imag
if re == 0 and im == 0: return "0"
elif re == 0: return imToString(im)
elif im == 0: return fltToString(re)
else:
im_str = "-"+imToString(-im) if im < 0 else "+"+imToString(im)
return fltToString(re)+im_str
### projector matrices ########################################################################
id = complexify([
1, 0, 0, 0,
0, 1, 0, 0,
0, 0, 1, 0,
0, 0, 0, 1
])
gamma1 = complexify([
0, 0, 0, 1j,
0, 0, 1j, 0,
0, -1j, 0, 0,
-1j, 0, 0, 0
])
gamma2 = complexify([
0, 0, 0, -1,
0, 0, 1, 0,
0, 1, 0, 0,
-1, 0, 0, 0
])
gamma3 = complexify([
0, 0, 1j, 0,
0, 0, 0, -1j,
-1j, 0, 0, 0,
0, 1j, 0, 0
])
gamma4 = complexify([
0, 0, 1, 0,
0, 0, 0, 1,
1, 0, 0, 0,
0, 1, 0, 0
])
def gplus(g1, g2):
return [x+y for (x,y) in zip(g1,g2)]
def gminus(g1, g2):
return [x-y for (x,y) in zip(g1,g2)]
def projectorToStr(p):
out = ""
for i in range(0, 4):
for j in range(0,4):
out += complexToStr(p[4*i+j]) + " "
out += "\n"
return out
projectors = [
gminus(id,gamma1), gplus(id,gamma1),
gminus(id,gamma2), gplus(id,gamma2),
gminus(id,gamma3), gplus(id,gamma3),
gminus(id,gamma4), gplus(id,gamma4)
]
### code generation ########################################################################
### parameters
sharedFloats = 19
dagger = False
def block(code):
lines = ''.join([" "+line+"\n" for line in code.splitlines()])
return "{\n"+lines+"}\n"
def sign(x):
if x==1: return "+"
elif x==-1: return "-"
def nthFloat4(n):
return `(n/4)` + "." + ["x", "y", "z", "w"][n%4]
def in_re(s, c): return "i"+`s`+`c`+"_re"
def in_im(s, c): return "i"+`s`+`c`+"_im"
def g_re(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_re"
def g_im(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_im"
def out_re(s, c): return "o"+`s`+`c`+"_re"
def out_im(s, c): return "o"+`s`+`c`+"_im"
def h1_re(h, c): return ["a","b"][h]+`c`+"_re"
def h1_im(h, c): return ["a","b"][h]+`c`+"_im"
def h2_re(h, c): return ["A","B"][h]+`c`+"_re"
def h2_im(h, c): return ["A","B"][h]+`c`+"_im"
def prolog():
str = []
str.append("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
str.append("#define SHARED_FLOATS_PER_THREAD "+`sharedFloats`+"\n")
str.append("#define SHARED_BYTES (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float))\n\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append("#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n")
str.append("#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n")
str.append("\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
str.append("#define "+g_re(0,m,n)+" G"+nthFloat4(2*i+0)+"\n")
str.append("#define "+g_im(0,m,n)+" G"+nthFloat4(2*i+1)+"\n")
str.append("\n")
for m in range(0,3):
for n in range(0,3):
i = 3*m+n
str.append("#define "+g_re(1,m,n)+" (+"+g_re(0,n,m)+")\n")
str.append("#define "+g_im(1,m,n)+" (-"+g_im(0,n,m)+")\n")
str.append("\n")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
if 2*i < sharedFloats:
str.append("#define "+out_re(s,c)+" s["+`(2*i+0)`+"]\n")
else:
str.append("volatile float "+out_re(s,c)+";\n")
if 2*i+1 < sharedFloats:
str.append("#define "+out_im(s,c)+" s["+`(2*i+1)`+"]\n")
else:
str.append("volatile float "+out_im(s,c)+";\n")
str.append("\n")
str.append(
"""
// Performs the complex conjugated accumulation: a += b* c*
#define ACC_CONJ_PROD(a, b, c) \\
a##_re += b##_re * c##_re - b##_im * c##_im, \\
a##_im -= b##_re * c##_im + b##_im * c##_re
#define READ_GAUGE_MATRIX(gauge, dir) \\
float4 G0 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+0)*Nh); \\
float4 G1 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+1)*Nh); \\
float4 G2 = tex1Dfetch((gauge), ga_idx + ((dir/2)*3+2)*Nh); \\
float4 G3 = make_float4(0,0,0,0); \\
float4 G4 = make_float4(0,0,0,0); \\
ACC_CONJ_PROD(g20, +g01, +g12); \\
ACC_CONJ_PROD(g20, -g02, +g11); \\
ACC_CONJ_PROD(g21, +g02, +g10); \\
ACC_CONJ_PROD(g21, -g00, +g12); \\
ACC_CONJ_PROD(g22, +g00, +g11); \\
ACC_CONJ_PROD(g22, -g01, +g10); \\
float u0 = (dir < 6 ? SPATIAL_SCALING : (ga_idx >= (L4-1)*L1h*L2*L3 ? TIME_SYMMETRY : 1)); \\
G3.x*=u0; G3.y*=u0; G3.z*=u0; G3.w*=u0; G4.x*=u0; G4.y*=u0;
#define READ_SPINOR(spinor) \\
float4 I0 = tex1Dfetch((spinor), sp_idx + 0*Nh); \\
float4 I1 = tex1Dfetch((spinor), sp_idx + 1*Nh); \\
float4 I2 = tex1Dfetch((spinor), sp_idx + 2*Nh); \\
float4 I3 = tex1Dfetch((spinor), sp_idx + 3*Nh); \\
float4 I4 = tex1Dfetch((spinor), sp_idx + 4*Nh); \\
float4 I5 = tex1Dfetch((spinor), sp_idx + 5*Nh);
int sid = BLOCK_DIM*blockIdx.x + threadIdx.x;
int boundaryCrossings = sid/L1h + sid/(L2*L1h) + sid/(L3*L2*L1h);
int X = 2*sid + (boundaryCrossings + oddBit) % 2;
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
""")
str.append("extern __shared__ float s_data[];\n")
str.append("volatile float *s = s_data+SHARED_FLOATS_PER_THREAD*threadIdx.x;\n\n")
for s in range(0,4):
for c in range(0,3):
str.append(out_re(s,c) + " = " + out_im(s,c)+" = 0;\n")
str.append("\n")
return ''.join(str)
# end def prolog
def epilog():
str = []
str.append(
"""
#ifdef DSLASH_XPAY
float4 accum0 = tex1Dfetch(accumTex, sid + 0*Nh);
float4 accum1 = tex1Dfetch(accumTex, sid + 1*Nh);
float4 accum2 = tex1Dfetch(accumTex, sid + 2*Nh);
float4 accum3 = tex1Dfetch(accumTex, sid + 3*Nh);
float4 accum4 = tex1Dfetch(accumTex, sid + 4*Nh);
float4 accum5 = tex1Dfetch(accumTex, sid + 5*Nh);
""")
for s in range(0,4):
for c in range(0,3):
i = 3*s+c
str.append(" "+out_re(s,c) +" = a*"+out_re(s,c)+" + accum"+nthFloat4(2*i+0)+";\n")
str.append(" "+out_im(s,c) +" = a*"+out_im(s,c)+" + accum"+nthFloat4(2*i+1)+";\n")
str.append("#endif\n\n")
str.append(
"""
#ifdef WRITE_FLOAT4
// this code exhibits a hardware bug in our C870 card
g_out[0*Nh+sid] = make_float4(o00_re, o00_im, o01_re, o01_im);
g_out[1*Nh+sid] = make_float4(o02_re, o02_im, o10_re, o10_im);
g_out[2*Nh+sid] = make_float4(o11_re, o11_im, o12_re, o12_im);
g_out[3*Nh+sid] = make_float4(o20_re, o20_im, o21_re, o21_im);
g_out[4*Nh+sid] = make_float4(o22_re, o22_im, o30_re, o30_im);
g_out[5*Nh+sid] = make_float4(o31_re, o31_im, o32_re, o32_im);
#endif
#ifdef WRITE_FLOAT1_SMEM
int t = threadIdx.x;
int B = BLOCK_DIM;
int b = blockIdx.x;
int f = SHARED_FLOATS_PER_THREAD;
__syncthreads();
for (int i = 0; i < 6; i++) // spinor indices
for (int c = 0; c < 4; c++) // components of float4
((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
#endif
#ifdef WRITE_FLOAT1_STAGGERED
// the alternative to writing float4's directly: almost as fast, a lot more confusing
int t = threadIdx.x;
int B = BLOCK_DIM;
int b = blockIdx.x;
int f = SHARED_FLOATS_PER_THREAD;
__syncthreads();
for (int i = 0; i < 4; i++) // spinor indices
for (int c = 0; c < 4; c++) // components of float4
((float*)g_out)[i*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
__syncthreads();
s[0] = o22_re;
s[1] = o22_im;
s[2] = o30_re;
s[3] = o30_im;
s[4] = o31_re;
s[5] = o31_im;
s[6] = o32_re;
s[7] = o32_im;
__syncthreads();
for (int i = 0; i < 2; i++)
for (int c = 0; c < 4; c++)
((float*)g_out)[(i+4)*(Nh*4) + b*(B*4) + c*(B) + t] = s_data[(c*B/4 + t/4)*(f) + i*(4) + t%4];
#endif
""")
return ''.join(str)
# end def prolog
def gen(dir):
projIdx = dir if not dagger else dir + (1 - 2*(dir%2))
projStr = projectorToStr(projectors[projIdx])
def proj(i,j):
return projectors[projIdx][4*i+j]
# if row(i) = (j, c), then the i'th row of the projector can be represented
# as a multiple of the j'th row: row(i) = c row(j)
def row(i):
assert i==2 or i==3
if proj(i,0) == 0j:
return (1, proj(i,1))
if proj(i,1) == 0j:
return (0, proj(i,0))
str = []
projName = "P"+`dir/2`+["-","+"][projIdx%2]
str.append("// Projector "+projName+"\n")
for l in projStr.splitlines():
str.append("// "+l+"\n")
str.append("\n")
if dir == 0: str.append("int sp_idx = ((x1==L1-1) ? X-(L1-1) : X+1) / 2;\n")
if dir == 1: str.append("int sp_idx = ((x1==0) ? X+(L1-1) : X-1) / 2;\n")
if dir == 2: str.append("int sp_idx = ((x2==L2-1) ? X-(L2-1)*L1 : X+L1) / 2;\n")
if dir == 3: str.append("int sp_idx = ((x2==0) ? X+(L2-1)*L1 : X-L1) / 2;\n")
if dir == 4: str.append("int sp_idx = ((x3==L3-1) ? X-(L3-1)*L2*L1 : X+L2*L1) / 2;\n")
if dir == 5: str.append("int sp_idx = ((x3==0) ? X+(L3-1)*L2*L1 : X-L2*L1) / 2;\n")
if dir == 6: str.append("int sp_idx = ((x4==L4-1) ? X-(L4-1)*L3*L2*L1 : X+L3*L2*L1) / 2;\n")
if dir == 7: str.append("int sp_idx = ((x4==0) ? X+(L4-1)*L3*L2*L1 : X-L3*L2*L1) / 2;\n")
ga_idx = "sid" if dir % 2 == 0 else "sp_idx"
str.append("int ga_idx = "+ga_idx+";\n\n")
project = []
project.append("// read spinor from device memory\n")
project.append("READ_SPINOR(spinorTex);\n\n")
project.append("// project spinor into half spinors\n")
for h in range(0, 2):
for c in range(0, 3):
strRe = []
strIm = []
for s in range(0, 4):
re = proj(h,s).real
im = proj(h,s).imag
if re==0 and im==0: ()
elif im==0:
strRe.append(sign(re)+in_re(s,c))
strIm.append(sign(re)+in_im(s,c))
elif re==0:
strRe.append(sign(-im)+in_im(s,c))
strIm.append(sign(im)+in_re(s,c))
project.append("float "+h1_re(h,c)+ " = "+''.join(strRe)+";\n")
project.append("float "+h1_im(h,c)+ " = "+''.join(strIm)+";\n")
project.append("\n")
ident = []
ident.append("// identity gauge matrix\n")
for m in range(0,3):
for h in range(0,2):
ident.append("float "+h2_re(h,m)+" = " + h1_re(h,m) + "; ")
ident.append("float "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n")
ident.append("\n")
mult = []
mult.append("// read gauge matrix from device memory\n")
mult.append("READ_GAUGE_MATRIX(gauge"+`dir%2`+"Tex, "+`dir`+");\n\n")
for m in range(0,3):
mult.append("// multiply row "+`m`+"\n")
for h in range(0,2):
re = ["float "+h2_re(h,m)+" ="]
im = ["float "+h2_im(h,m)+" ="]
for c in range(0,3):
re.append(" + ("+g_re(dir,m,c)+" * "+h1_re(h,c)+" - "+g_im(dir,m,c)+" * "+h1_im(h,c)+")")
im.append(" + ("+g_re(dir,m,c)+" * "+h1_im(h,c)+" + "+g_im(dir,m,c)+" * "+h1_re(h,c)+")")
mult.append(''.join(re)+";\n")
mult.append(''.join(im)+";\n")
mult.append("\n")
reconstruct = []
for m in range(0,3):
reconstruct.append(out_re(0, m) + " += " + h2_re(0,m) + ";\n")
reconstruct.append(out_im(0, m) + " += " + h2_im(0,m) + ";\n")
reconstruct.append(out_re(1, m) + " += " + h2_re(1,m) + ";\n")
reconstruct.append(out_im(1, m) + " += " + h2_im(1,m) + ";\n")
for s in range(2,4):
(h,c) = row(s)
re = c.real
im = c.imag
if im == 0:
reconstruct.append(out_re(s, m) + " " + sign(re) + "= " + h2_re(h,m) + ";\n")
reconstruct.append(out_im(s, m) + " " + sign(re) + "= " + h2_im(h,m) + ";\n")
elif re == 0:
reconstruct.append(out_re(s, m) + " " + sign(-im) + "= " + h2_im(h,m) + ";\n")
reconstruct.append(out_im(s, m) + " " + sign(+im) + "= " + h2_re(h,m) + ";\n")
reconstruct.append("\n")
if dir >= 6:
str.append("if (GAUGE_FIXED && ga_idx >= L1h*L2*L3) ")
str.append(block(''.join(project) + ''.join(ident) + ''.join(reconstruct)))
str.append("else ")
str.append(block(''.join(project) + ''.join(mult) + ''.join(reconstruct)))
else:
str.append(''.join(project) + ''.join(mult) + ''.join(reconstruct))
return block(''.join(str))+"\n"
# end def gen
def generate():
return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + epilog()
# dagger = False
dagger = True
print generate()
This diff is collapsed.
// All dslash definitions
__global__ void
dslashSingle12Kernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashSingle12DaggerKernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#define DSLASH_XPAY
__global__ void
dslashSingle12XpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashSingle12DaggerXpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#undef DSLASH_XPAY
__global__ void
dslashSingle8Kernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashSingle8DaggerKernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#define DSLASH_XPAY
__global__ void
dslashSingle8XpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashSingle8DaggerXpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8
#define GAUGE0TEX gauge0TexSingle
#define GAUGE1TEX gauge1TexSingle
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#undef DSLASH_XPAY
//////////////////////////
__global__ void
dslashHalf12Kernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashHalf12DaggerKernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#define DSLASH_XPAY
__global__ void
dslashHalf12XpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashHalf12DaggerXpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#undef DSLASH_XPAY
__global__ void
dslashHalf8Kernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashHalf8DaggerKernel(float4* g_out, int oddBit) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#define DSLASH_XPAY
__global__ void
dslashHalf8XpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
__global__ void
dslashHalf8DaggerXpayKernel(float4* g_out, int oddBit, float a) {
#define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_HALF
#define GAUGE0TEX gauge0TexHalf
#define GAUGE1TEX gauge1TexHalf
#include "dslash_dagger_core.h"
#undef GAUGE1TEX
#undef GAUGE0TEX
#undef READ_GAUGE_MATRIX
}
#undef DSLASH_XPAY
#include <stdlib.h>
#include <stdio.h>
#include <dslash_quda.h>
// ----------------------------------------------------------------------
// Cuda code
// Half precision gauge field
texture<short4, 1, cudaReadModeNormalizedFloat> gauge0TexHalf;
texture<short4, 1, cudaReadModeNormalizedFloat> gauge1TexHalf;
// Single precision gauge field
texture<float4, 1, cudaReadModeElementType> gauge0TexSingle;
texture<float4, 1, cudaReadModeElementType> gauge1TexSingle;
texture<float4, 1, cudaReadModeElementType> spinorTex;
texture<float4, 1, cudaReadModeElementType> accumTex;
QudaGaugeParam *gauge_param;
__constant__ int X1;
__constant__ int X2;
__constant__ int X3;
__constant__ int X4;
__constant__ int X1h;
__constant__ float anisotropy;
__constant__ float t_boundary;
__constant__ int gauge_fixed;
#define WRITE_SPINOR WRITE_SPINOR_FLOAT4
//#define WRITE_SPINOR WRITE_FLOAT1_SMEM
//#define WRITE_SPINOR WRITE_SPINOR_FLOAT1_STAGGERED
#include <dslash_def.h>
void setCudaGaugeParam() {
cudaMemcpyToSymbol("anisotropy", &(gauge_param->anisotropy), sizeof(float));
float t_bc = (gauge_param->t_boundary == QUDA_PERIODIC_T) ? 1.0 : -1.0;
cudaMemcpyToSymbol("t_boundary", &(t_bc), sizeof(float));
int gf = (gauge_param->gauge_fix == QUDA_GAUGE_FIXED_YES) ? 1 : 0;
cudaMemcpyToSymbol("gauge_fixed", &(gf), sizeof(int));
}
// ----------------------------------------------------------------------
void dslashCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit) {
int packed_gauge_bytes = (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) ?
PACKED12_GAUGE_BYTES : PACKED8_GAUGE_BYTES;
if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) packed_gauge_bytes/=2;
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
if (oddBit) {
cudaBindTexture(0, gauge0TexSingle, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.even, packed_gauge_bytes);
} else {
cudaBindTexture(0, gauge0TexSingle, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.odd, packed_gauge_bytes);
}
} else {
if (oddBit) {
cudaBindTexture(0, gauge0TexHalf, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.even, packed_gauge_bytes);
} else {
cudaBindTexture(0, gauge0TexHalf, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.odd, packed_gauge_bytes);
}
}
cudaBindTexture(0, spinorTex, spinor, SPINOR_BYTES);
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSingle12Kernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
} else {
dslashSingle12DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
}
} else {
if (!daggerBit) {
dslashSingle8Kernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
} else {
dslashSingle8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
}
}
} else {
if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHalf12Kernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
} else {
dslashHalf12DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
}
} else {
if (!daggerBit) {
dslashHalf8Kernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
} else {
dslashHalf8DaggerKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit);
}
}
}
}
void dslashXpayCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a) {
int packed_gauge_bytes = (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) ?
PACKED12_GAUGE_BYTES : PACKED8_GAUGE_BYTES;
if (gauge_param->cuda_prec == QUDA_HALF_PRECISION) packed_gauge_bytes/=2;
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
if (oddBit) {
cudaBindTexture(0, gauge0TexSingle, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.even, packed_gauge_bytes);
} else {
cudaBindTexture(0, gauge0TexSingle, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexSingle, gauge.odd, packed_gauge_bytes);
}
} else {
if (oddBit) {
cudaBindTexture(0, gauge0TexHalf, gauge.odd, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.even, packed_gauge_bytes);
} else {
cudaBindTexture(0, gauge0TexHalf, gauge.even, packed_gauge_bytes);
cudaBindTexture(0, gauge1TexHalf, gauge.odd, packed_gauge_bytes);
}
}
cudaBindTexture(0, spinorTex, spinor, SPINOR_BYTES);
cudaBindTexture(0, accumTex, x, SPINOR_BYTES);
dim3 gridDim(GRID_DIM, 1, 1);
dim3 blockDim(BLOCK_DIM, 1, 1);
if (gauge_param->cuda_prec == QUDA_SINGLE_PRECISION) {
if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashSingle12XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
} else {
dslashSingle12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
} else if (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashSingle8XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
else {
dslashSingle8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
}
} else {
if (gauge_param->reconstruct == QUDA_RECONSTRUCT_12) {
if (!daggerBit) {
dslashHalf12XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
} else {
dslashHalf12DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
} else if (gauge_param->reconstruct == QUDA_RECONSTRUCT_8) {
if (!daggerBit) {
dslashHalf8XpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
else {
dslashHalf8DaggerXpayKernel <<<gridDim, blockDim, SHARED_BYTES>>>
((float4 *)res, oddBit, a);
}
}
}
}
int dslashCudaSharedBytes() {
return SHARED_BYTES;
}
// Apply the even-odd preconditioned Dirac operator
void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, 0);
dslashXpayCuda(out, gauge, tmp, 0, 0, in, kappa2);
} else {
dslashCuda(tmp, gauge, in, 0, 0);
dslashXpayCuda(out, gauge, tmp, 1, 0, in, kappa2);
}
}
// Apply the even-odd preconditioned Dirac operator
void MatPCDagCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, 1);
dslashXpayCuda(out, gauge, tmp, 0, 1, in, kappa2);
} else {
dslashCuda(tmp, gauge, in, 0, 1);
dslashXpayCuda(out, gauge, tmp, 1, 1, in, kappa2);
}
}
// Apply the even-odd preconditioned Dirac operator
cuComplex MatPCcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, 0);
dslashCuda(out, gauge, tmp, 0, 0);
} else {
dslashCuda(tmp, gauge, in, 0, 0);
dslashCuda(out, gauge, tmp, 1, 0);
}
// out = in - kappa2*out, dot = (d, out)
return xpaycDotzyCuda((float2*)in, kappa2, (float2*)out, (float2*)d, Nh*spinorSiteSize/2);
}
// Apply the even-odd preconditioned Dirac operator
cuComplex MatPCDagcDotWXCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, float kappa,
ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type) {
float kappa2 = -kappa*kappa;
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashCuda(tmp, gauge, in, 1, 1);
dslashCuda(out, gauge, tmp, 0, 1);
} else {
dslashCuda(tmp, gauge, in, 0, 1);
dslashCuda(out, gauge, tmp, 1, 1);
}
// out = in - kappa2*out, dot = (d, out)
return xpaycDotzyCuda((float2*)in, kappa2, (float2*)out, (float2*)d, Nh*spinorSiteSize/2);
}
void MatPCDagMatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in,
float kappa, ParitySpinor tmp, MatPCType matpc_type) {
MatPCCuda(out, gauge, in, kappa, tmp, matpc_type);
MatPCDagCuda(out, gauge, out, kappa, tmp, matpc_type);
}
#ifndef _QUDA_DSLASH_H
#define _QUDA_DSLASH_H
#include <cuComplex.h>
#include <quda.h>
#define packed12GaugeSiteSize 12 // real numbers per link, using SU(3) reconstruction
#define packed8GaugeSiteSize 8 // real numbers per link, using SU(3) reconstruction
#define gaugeSiteSize 18 // real numbers per link
#define spinorSiteSize 24 // real numbers per spinor
#define BLOCK_DIM (64) // threads per block
#define GRID_DIM (Nh/BLOCK_DIM) // there are Nh threads in total
#define SPINOR_BYTES (Nh*spinorSiteSize*sizeof(float))
#define PACKED12_GAUGE_BYTES (4*Nh*packed12GaugeSiteSize*sizeof(float))
#define PACKED8_GAUGE_BYTES (4*Nh*packed8GaugeSiteSize*sizeof(float))
#ifdef __cplusplus
extern "C" {
#endif
extern QudaGaugeParam *gauge_param;
// ---------- dslash_quda.cu ----------
void setCudaGaugeParam();
void dslashCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit);
void dslashXpayCuda(ParitySpinor res, FullGauge gauge, ParitySpinor spinor,
int oddBit, int daggerBit, ParitySpinor x, float a);
int dslashCudaSharedBytes();
void MatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
void MatPCDagMatPCCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, MatPCType matpc_type);
cuComplex MatPCcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
cuComplex MatPCDagcDotWXCuda(ParitySpinor outEven, FullGauge gauge, ParitySpinor inEven,
float kappa, ParitySpinor tmp, ParitySpinor d, MatPCType matpc_type);
// ---------- dslash_reference.cpp ----------
void dslashReference(float *res, float **gauge, float *spinorField,
int oddBit, int daggerBit);
void Mat(float *out, float **gauge, float *in, float kappa);
void MatDag(float *out, float **gauge, float *in, float kappa);
void MatDagMat(float *out, float **gauge, float *in, float kappa);
void MatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
void MatPCDag(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa, MatPCType matpc_type);
// -- inv_cg_cuda.cpp
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *param);
// -- inv_bicgstab_cuda.cpp
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *param, DagType dag_type);
// ---------- cg_reference.cpp ----------
void cgReference(float *out, float **gauge, float *in, float kappa, float tol);
#ifdef __cplusplus
}
#endif
#endif // _QUDA_DLASH_H
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <quda.h>
#include <util_quda.h>
void sum(float *dst, float *a, float *b, int cnt) {
for (int i = 0; i < cnt; i++)
dst[i] = a[i] + b[i];
}
// i represents a "half index" into an even or odd "half lattice".
// when oddBit={0,1} the half lattice is {even,odd}.
//
// the displacements, such as dx, refer to the full lattice coordinates.
//
// neighborIndex() takes a "half index", displaces it, and returns the
// new "half index", which can be an index into either the even or odd lattices.
// displacements of magnitude one always interchange odd and even lattices.
//
int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
int X = fullLatticeIndex(i, oddBit);
int x4 = X/(L3*L2*L1);
int x3 = (X/(L2*L1)) % L3;
int x2 = (X/L1) % L2;
int x1 = X % L1;
// assert (oddBit == (x+y+z+t)%2);
x4 = (x4+dx4+L4) % L4;
x3 = (x3+dx3+L3) % L3;
x2 = (x2+dx2+L2) % L2;
x1 = (x1+dx1+L1) % L1;
return (x4*(L3*L2*L1) + x3*(L2*L1) + x2*(L1) + x1) / 2;
}
float *gaugeLink(int i, int dir, int oddBit, float **gaugeEven, float **gaugeOdd) {
float **gaugeField;
int j;
if (dir % 2 == 0) {
j = i;
gaugeField = (oddBit ? gaugeOdd : gaugeEven);
}
else {
switch (dir) {
case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -1); break;
case 3: j = neighborIndex(i, oddBit, 0, 0, -1, 0); break;
case 5: j = neighborIndex(i, oddBit, 0, -1, 0, 0); break;
case 7: j = neighborIndex(i, oddBit, -1, 0, 0, 0); break;
default: j = -1; break;
}
gaugeField = (oddBit ? gaugeEven : gaugeOdd);
}
return &gaugeField[dir/2][j*(3*3*2)];
}
float *spinorNeighbor(int i, int dir, int oddBit, float *spinorField) {
int j;
switch (dir) {
case 0: j = neighborIndex(i, oddBit, 0, 0, 0, +1); break;
case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -1); break;
case 2: j = neighborIndex(i, oddBit, 0, 0, +1, 0); break;
case 3: j = neighborIndex(i, oddBit, 0, 0, -1, 0); break;
case 4: j = neighborIndex(i, oddBit, 0, +1, 0, 0); break;
case 5: j = neighborIndex(i, oddBit, 0, -1, 0, 0); break;
case 6: j = neighborIndex(i, oddBit, +1, 0, 0, 0); break;
case 7: j = neighborIndex(i, oddBit, -1, 0, 0, 0); break;
default: j = -1; break;
}
return &spinorField[j*(4*3*2)];
}
void dot(float* res, float* a, float* b) {
res[0] = res[1] = 0;
for (int m = 0; m < 3; m++) {
float a_re = a[2*m+0];
float a_im = a[2*m+1];
float b_re = b[2*m+0];
float b_im = b[2*m+1];
res[0] += a_re * b_re - a_im * b_im;
res[1] += a_re * b_im + a_im * b_re;
}
}
void su3_transpose(float *res, float *mat) {
for (int m = 0; m < 3; m++) {
for (int n = 0; n < 3; n++) {
res[m*(3*2) + n*(2) + 0] = + mat[n*(3*2) + m*(2) + 0];
res[m*(3*2) + n*(2) + 1] = - mat[n*(3*2) + m*(2) + 1];
}
}
}
void su3_mul(float *res, float *mat, float *vec) {
for (int n = 0; n < 3; n++) {
dot(&res[n*(2)], &mat[n*(3*2)], vec);
}
}
void su3_Tmul(float *res, float *mat, float *vec) {
float matT[3*3*2];
su3_transpose(matT, mat);
su3_mul(res, matT, vec);
}
const float projector[8][4][4][2] = {
{
{{1,0}, {0,0}, {0,0}, {0,-1}},
{{0,0}, {1,0}, {0,-1}, {0,0}},
{{0,0}, {0,1}, {1,0}, {0,0}},
{{0,1}, {0,0}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {0,0}, {0,1}},
{{0,0}, {1,0}, {0,1}, {0,0}},
{{0,0}, {0,-1}, {1,0}, {0,0}},
{{0,-1}, {0,0}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {0,0}, {1,0}},
{{0,0}, {1,0}, {-1,0}, {0,0}},
{{0,0}, {-1,0}, {1,0}, {0,0}},
{{1,0}, {0,0}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {0,0}, {-1,0}},
{{0,0}, {1,0}, {1,0}, {0,0}},
{{0,0}, {1,0}, {1,0}, {0,0}},
{{-1,0}, {0,0}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {0,-1}, {0,0}},
{{0,0}, {1,0}, {0,0}, {0,1}},
{{0,1}, {0,0}, {1,0}, {0,0}},
{{0,0}, {0,-1}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {0,1}, {0,0}},
{{0,0}, {1,0}, {0,0}, {0,-1}},
{{0,-1}, {0,0}, {1,0}, {0,0}},
{{0,0}, {0,1}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {-1,0}, {0,0}},
{{0,0}, {1,0}, {0,0}, {-1,0}},
{{-1,0}, {0,0}, {1,0}, {0,0}},
{{0,0}, {-1,0}, {0,0}, {1,0}}
},
{
{{1,0}, {0,0}, {1,0}, {0,0}},
{{0,0}, {1,0}, {0,0}, {1,0}},
{{1,0}, {0,0}, {1,0}, {0,0}},
{{0,0}, {1,0}, {0,0}, {1,0}}
}
};
// todo pass projector
void multiplySpinorByDiracProjector(float *res, int projIdx, float *spinorIn) {
zero(res, 4*3*2);
for (int s = 0; s < 4; s++) {
for (int t = 0; t < 4; t++) {
float projRe = projector[projIdx][s][t][0];
float projIm = projector[projIdx][s][t][1];
for (int m = 0; m < 3; m++) {
float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
res[s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
res[s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
}
}
}
}
//
// dslashReference()
//
// if oddBit is zero: calculate odd parity spinor elements (using even parity spinor)
// if oddBit is one: calculate even parity spinor elements
//
// if daggerBit is zero: perform ordinary dslash operator
// if daggerBit is one: perform hermitian conjugate of dslash
//
void dslashReference(float *res, float **gaugeFull, float *spinorField, int oddBit, int daggerBit) {
zero(res, Nh*4*3*2);
float *gaugeEven[4], *gaugeOdd[4];
for (int dir = 0; dir < 4; dir++) {
gaugeEven[dir] = gaugeFull[dir];
gaugeOdd[dir] = gaugeFull[dir]+Nh*gaugeSiteSize;
}
for (int i = 0; i < Nh; i++) {
for (int dir = 0; dir < 8; dir++) {
float *gauge = gaugeLink(i, dir, oddBit, gaugeEven, gaugeOdd);
float *spinor = spinorNeighbor(i, dir, oddBit, spinorField);
float projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
multiplySpinorByDiracProjector(projectedSpinor, projIdx, spinor);
for (int s = 0; s < 4; s++) {
if (dir % 2 == 0)
su3_mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
else
su3_Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
}
sum(&res[i*(4*3*2)], &res[i*(4*3*2)], gaugedSpinor, 4*3*2);
}
}
}
void Mat(float *out, float **gauge, float *in, float kappa) {
float *inEven = in;
float *inOdd = in + Nh*spinorSiteSize;
float *outEven = out;
float *outOdd = out + Nh*spinorSiteSize;
// full dslash operator
dslashReference(outOdd, gauge, inEven, 1, 0);
dslashReference(outEven, gauge, inOdd, 0, 0);
// lastly apply the kappa term
xpay(in, -kappa, out, N*spinorSiteSize);
}
void MatDag(float *out, float **gauge, float *in, float kappa) {
float *inEven = in;
float *inOdd = in + Nh*spinorSiteSize;
float *outEven = out;
float *outOdd = out + Nh*spinorSiteSize;
// full dslash operator
dslashReference(outOdd, gauge, inEven, 1, 1);
dslashReference(outEven, gauge, inOdd, 0, 1);
// lastly apply the kappa term
xpay(in, -kappa, out, N*spinorSiteSize);
}
void MatDagMat(float *out, float **gauge, float *in, float kappa) {
float *tmp = (float*)malloc(N*spinorSiteSize*sizeof(float));
Mat(tmp, gauge, in, kappa);
MatDag(out, gauge, tmp, kappa);
free(tmp);
}
// Apply the even-odd preconditioned Dirac operator
void MatPC(float *outEven, float **gauge, float *inEven, float kappa,
MatPCType matpc_type) {
float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
// full dslash operator
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashReference(tmp, gauge, inEven, 1, 0);
dslashReference(outEven, gauge, tmp, 0, 0);
} else {
dslashReference(tmp, gauge, inEven, 0, 0);
dslashReference(outEven, gauge, tmp, 1, 0);
}
// lastly apply the kappa term
float kappa2 = -kappa*kappa;
xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
free(tmp);
}
// Apply the even-odd preconditioned Dirac operator
void MatPCDag(float *outEven, float **gauge, float *inEven, float kappa,
MatPCType matpc_type) {
float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
// full dslash operator
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
dslashReference(tmp, gauge, inEven, 1, 1);
dslashReference(outEven, gauge, tmp, 0, 1);
} else {
dslashReference(tmp, gauge, inEven, 0, 1);
dslashReference(outEven, gauge, tmp, 1, 1);
}
float kappa2 = -kappa*kappa;
xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
free(tmp);
}
void MatPCDagMatPC(float *out, float **gauge, float *in, float kappa,
MatPCType matpc_type) {
float *tmp = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
MatPC(tmp, gauge, in, kappa, matpc_type);
MatPCDag(out, gauge, tmp, kappa, matpc_type);
free(tmp);
}
#include <stdio.h>
#include <stdlib.h>
#include <quda.h>
#include <util_quda.h>
#include <field_quda.h>
#define FULL_WILSON 1
QudaGaugeParam param;
FullGauge cudaGauge;
FullSpinor cudaSpinor;
ParitySpinor tmp;
float *hostGauge[4];
float *spinor, *spinorRef;
float *spinorEven, *spinorOdd;
float kappa = 1.0;
int ODD_BIT = 0;
int DAGGER_BIT = 0;
void printSpinorHalfField(float *spinor) {
printSpinor(&spinor[0*spinorSiteSize]);
printf("...\n");
printSpinor(&spinor[(Nh-1)*spinorSiteSize]);
printf("\n");
}
void init() {
param.cpu_prec = QUDA_SINGLE_PRECISION;
param.cuda_prec = QUDA_HALF_PRECISION;
param.X = L1;
param.Y = L2;
param.Z = L3;
param.T = L4;
param.anisotropy = 2.3;
param.reconstruct = QUDA_RECONSTRUCT_8;
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
param.t_boundary = QUDA_ANTI_PERIODIC_T;
param.gauge_fix = QUDA_GAUGE_FIXED_NO;
gauge_param = &param;
// construct input fields
for (int dir = 0; dir < 4; dir++) {
hostGauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
}
spinor = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinorRef = (float*)malloc(N*spinorSiteSize*sizeof(float));
spinorEven = spinor;
spinorOdd = spinor + Nh*spinorSiteSize;
printf("Randomizing fields...");
constructGaugeField(hostGauge);
constructSpinorField(spinor);
printf("done.\n"); fflush(stdout);
int dev = 0;
cudaSetDevice(dev);
printf("Sending fields to GPU..."); fflush(stdout);
cudaGauge = loadGaugeField(hostGauge);
cudaSpinor = allocateSpinorField();
tmp = allocateParitySpinor();
loadSpinorField(cudaSpinor, (void*)spinor, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
}
void end() {
// release memory
for (int dir = 0; dir < 4; dir++) free(hostGauge[dir]);
free(spinor);
free(spinorRef);
freeSpinorField(cudaSpinor);
freeSpinorBuffer();
freeParitySpinor(tmp);
}
void dslashRef() {
// compare to dslash reference implementation
printf("Calculating reference implementation...");
fflush(stdout);
if (FULL_WILSON)
MatPC(spinorRef, hostGauge, spinorEven, kappa, QUDA_MATPC_EVEN_EVEN);
else
dslashReference(spinorRef, hostGauge, spinorEven, ODD_BIT, DAGGER_BIT);
printf("done.\n");
}
double dslashCUDA() {
// execute kernel
const int LOOPS = 200;
printf("Executing %d kernel loops...", LOOPS);
fflush(stdout);
stopwatchStart();
for (int i = 0; i < LOOPS; i++) {
if (FULL_WILSON)
MatPCCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, kappa, tmp, QUDA_MATPC_EVEN_EVEN);
else
dslashCuda(cudaSpinor.odd, cudaGauge, cudaSpinor.even, ODD_BIT, DAGGER_BIT);
}
// check for errors
cudaError_t stat = cudaGetLastError();
if (stat != cudaSuccess)
printf("with ERROR: %s\n", cudaGetErrorString(stat));
cudaThreadSynchronize();
double secs = stopwatchReadSeconds() / LOOPS;
printf("done.\n\n");
return secs;
}
void dslashTest() {
init();
float spinorGiB = (float)Nh*spinorSiteSize*sizeof(float) / (1 << 30);
float sharedKB = (float)dslashCudaSharedBytes() / (1 << 10);
printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
printf("Gauge mem: %.3f GiB\n", param.gaugeGiB);
printf("Shared mem: %.3f KB\n", sharedKB);
int attempts = 100;
dslashRef();
for (int i=0; i<attempts; i++) {
double secs = dslashCUDA();
retrieveParitySpinor(spinorOdd, cudaSpinor.odd, QUDA_SINGLE_PRECISION,
QUDA_SINGLE_PRECISION, QUDA_DIRAC_ORDER);
// print timing information
printf("%fms per loop\n", 1000*secs);
int flops = FULL_WILSON ? 1320*2 + 48 : 1320;
int floats = FULL_WILSON ? 2*(7*24+8*param.packed_size+24)+24 : 7*24+8*param.packed_size+24;
printf("GFLOPS = %f\n", 1.0e-9*flops*Nh/secs);
printf("GiB/s = %f\n\n", Nh*floats*sizeof(float)/(secs*(1<<30)));
int res = compareFloats(spinorOdd, spinorRef, Nh*4*3*2, 1e-1);
printf("%d Test %s\n", i, (1 == res) ? "PASSED" : "FAILED");
}
end();
}
int main(int argc, char **argv) {
dslashTest();
}
/*
printf("Reference:\n");
printSpinorHalfField(spinorRef);
printf("\nCUDA:\n");
printSpinorHalfField(spinorOdd);
int fail_check = 12;
int fail[fail_check];
for (int f=0; f<fail_check; f++) fail[f] = 0;
int iter[24];
for (int i=0; i<24; i++) iter[i] = 0;
for (int i=0; i<Nh; i++) {
for (int j=0; j<24; j++) {
int is = i*24+j;
float diff = fabs(spinorRef[is]-spinorOdd[is]);
for (int f=0; f<fail_check; f++)
if (diff > pow(10,-(f+1))) fail[f]++;
//if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
if (diff > 1e-6) iter[j]++;
}
}
//for (int i=0; i<24; i++) printf("%d fails = %d\n", i, iter[i]);
for (int f=0; f<fail_check; f++) {
printf("%e Failures: %d / %d = %e\n", pow(10,-(f+1)), fail[f], Nh*24, fail[f] / (float)(Nh*24));
}
if (stat != cudaSuccess)
printf("Cuda failed with ERROR: %s\n", cudaGetErrorString(stat));
*/
#ifndef _ENUM_QUDA_H
#define _ENUM_QUDA_H
#ifdef __cplusplus
extern "C" {
#endif
typedef enum QudaGaugeFieldOrder_s {
QUDA_QDP_GAUGE_ORDER, // even-odd, row-column colour
QUDA_CPS_WILSON_GAUGE_ORDER, // even-odd, column-row colour
} QudaGaugeFieldOrder;
typedef enum QudaDiracFieldOrder_s {
QUDA_DIRAC_ORDER, // even-odd, colour inside spin
QUDA_QDP_DIRAC_ORDER, // even-odd, spin inside colour
QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, colour inside spin
} QudaDiracFieldOrder;
typedef enum QudaInverterType_s {
QUDA_CG_INVERTER,
QUDA_BICGSTAB_INVERTER
} QudaInverterType;
typedef enum QudaPrecision_s {
QUDA_HALF_PRECISION,
QUDA_SINGLE_PRECISION,
QUDA_DOUBLE_PRECISION
} QudaPrecision;
// Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo)
typedef enum QudaMatPCType_s {
QUDA_MATPC_EVEN_EVEN,
QUDA_MATPC_ODD_ODD
} QudaMatPCType;
// The different solutions supported
typedef enum QudaSolutionType_s {
QUDA_MAT_SOLUTION,
QUDA_MATPC_SOLUTION,
QUDA_MATPCDAG_SOLUTION,
QUDA_MATPCDAG_MATPC_SOLUTION,
} QudaSolutionType;
typedef enum QudaMassNormalization_s {
QUDA_KAPPA_NORMALIZATION,
QUDA_MASS_NORMALIZATION
} QudaMassNormalization;
typedef enum QudaPreserveSource_s {
QUDA_PRESERVE_SOURCE_NO, // use the source for the residual
QUDA_PRESERVE_SOURCE_YES // keep the source intact
} QudaPreserveSource;
typedef enum QudaReconstructType_s {
QUDA_RECONSTRUCT_NO, // store all 18 real numbers epxlicitly
QUDA_RECONSTRUCT_8, // reconstruct from 8 real numbers
QUDA_RECONSTRUCT_12 // reconstruct from 12 real numbers
} QudaReconstructType;
typedef enum QudaGaugeFixed_s {
QUDA_GAUGE_FIXED_NO, // No gauge fixing
QUDA_GAUGE_FIXED_YES // Gauge field stored in temporal gauge
} QudaGaugeFixed;
typedef enum QudaDagType_s {
QUDA_DAG_NO,
QUDA_DAG_YES
} QudaDagType;
typedef enum QudaTboundary_s {
QUDA_ANTI_PERIODIC_T = -1,
QUDA_PERIODIC_T = 1
} QudaTboundary;
#ifdef __cplusplus
}
#endif
#endif // _ENUM_QUDA_H
This diff is collapsed.
#ifndef _QUDA_FIELD_H
#define _QUDA_FIELD_H
#include <enum_quda.h>
#include <dslash_quda.h>
#ifdef __cplusplus
extern "C" {
#endif
ParitySpinor allocateParitySpinor();
FullSpinor allocateSpinorField();
void freeGaugeField(FullGauge gauge);
void freeParitySpinor(ParitySpinor spinor);
void freeSpinorField(FullSpinor spinor);
void freeSpinorBuffer();
FullGauge loadGaugeField(void *gauge);
void loadParitySpinor(ParitySpinor, void *spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
void loadSpinorField(FullSpinor, void *spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, Precision cuda_prec,
DiracFieldOrder dirac_order);
#ifdef __cplusplus
}
#endif
#endif // _QUDA_FIELD_H
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuComplex.h>
#include <quda.h>
#include <util_quda.h>
#include <field_quda.h>
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *perf, DagType dag_type)
{
int len = Nh*spinorSiteSize;
ParitySpinor r = allocateParitySpinor();
ParitySpinor p = allocateParitySpinor();
ParitySpinor v = allocateParitySpinor();
ParitySpinor t = allocateParitySpinor();
copyCuda((float *)r, (float *)b, len);
zeroCuda((float *)x, len);
float b2 = normCuda((float *)b, len);
float r2 = b2;
float stop = b2*perf->tol*perf->tol; // stopping condition of solver
cuComplex rho = make_cuFloatComplex(1.0f, 0.0f);
cuComplex rho0 = rho;
cuComplex alpha = make_cuFloatComplex(1.0f, 0.0f);
cuComplex omega = make_cuFloatComplex(1.0f, 0.0f);
cuComplex beta;
cuComplex rv;
cuComplex rho_rho0;
cuComplex alpha_omega;
cuComplex beta_omega;
cuComplex one = make_cuFloatComplex(1.0f, 0.0f);
float3 rho_r2;
float3 omega_t2;
int k=0;
//printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
if (k==0) {
rho = make_cuFloatComplex(r2, 0.0);
copyCuda((float *)p, (float *)r, len);
} else {
alpha_omega = cuCdivf(alpha, omega);
rho_rho0 = cuCdivf(rho, rho0);
beta = cuCmulf(rho_rho0, alpha_omega);
// p = r - beta*omega*v + beta*(p)
beta_omega = cuCmulf(beta, omega); beta_omega.x *= -1.0f; beta_omega.y *= -1.0f;
cxpaypbzCuda((float2*)r, beta_omega, (float2*)v, beta, (float2*)p, len/2); // 8
}
if (dag_type == QUDA_DAG_NO)
rv = MatPCcDotWXCuda(v, gauge, p, perf->kappa, tmp, b, perf->matpc_type);
else
rv = MatPCDagcDotWXCuda(v, gauge, p, perf->kappa, tmp, b, perf->matpc_type);
alpha = cuCdivf(rho, rv);
// r -= alpha*v
alpha.x *= -1.0f; alpha.y *= -1.0f;
caxpyCuda(alpha, (float2*)v, (float2*)r, len/2); // 4
alpha.x *= -1.0f; alpha.y *= -1.0f;
if (dag_type == QUDA_DAG_NO) MatPCCuda(t, gauge, r, perf->kappa, tmp, perf->matpc_type);
else MatPCDagCuda(t, gauge, r, perf->kappa, tmp, perf->matpc_type);
// omega = (t, r) / (t, t)
omega_t2 = cDotProductNormACuda((float2*)t, (float2*)r, len/2); // 6
omega.x = omega_t2.x / omega_t2.z; omega.y = omega_t2.y/omega_t2.z;
//x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r)
rho_r2 = caxpbypzYmbwcDotProductWYNormYCuda(alpha, (float2*)p, omega, (float2*)r,
(float2*)x, (float2*)t, (float2*)b, len/2);
rho0 = rho; rho.x = rho_r2.x; rho.y = rho_r2.y; r2 = rho_r2.z;
k++;
//printf("%d iterations, r2 = %e, x2 = %e\n", k, r2, normCuda((float*)x, len));
}
perf->secs += stopwatchReadSeconds();
//if (k==maxiters) printf("Exceeded maximum iterations %d\n", maxiters);
float gflops = (1.0e-9*Nh)*(2*(2*1320+48)*k + (32*k + 8*(k-1))*spinorSiteSize);
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
perf->gflops += gflops;
perf->iter += k;
#if 0
// Calculate the true residual
if (dag_type == QUDA_DAG_NO) MatPCCuda(t, gauge, x, perf->kappa, tmp, perf->matpc_type);
else MatPCDagCuda(t, gauge, x, perf->kappa, tmp, perf->matpc_type);
copyCuda((float *)r, (float *)b, len);
mxpyCuda((float *)t, (float *)r, len);
double true_res = normCuda((float *)r, len);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
freeParitySpinor(r);
freeParitySpinor(v);
freeParitySpinor(t);
freeParitySpinor(p);
return;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <quda.h>
#include <util_quda.h>
#include <field_quda.h>
void invertCgCuda(ParitySpinor x, ParitySpinor b, FullGauge gauge,
ParitySpinor tmp, QudaInvertParam *perf)
{
int len = Nh*spinorSiteSize;
ParitySpinor r;
ParitySpinor p = allocateParitySpinor();
ParitySpinor Ap = allocateParitySpinor();
float b2 = normCuda((float *)b, len);
float r2 = b2;
float r2_old;
float stop = r2*perf->tol*perf->tol; // stopping condition of solver
float alpha, beta, pAp;
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) {
r = allocateParitySpinor();
copyCuda((float *)r, (float *)b, len);
} else {
r = b;
}
copyCuda((float *)p, (float *)r, len);
zeroCuda((float *)x, len);
int k=0;
//printf("%d iterations, r2 = %e\n", k, r2);
stopwatchStart();
while (r2 > stop && k<perf->maxiter) {
MatPCDagMatPCCuda(Ap, gauge, p, perf->kappa, tmp, perf->matpc_type);
pAp = reDotProductCuda((float *)p, (float *)Ap, len);
alpha = r2 / pAp;
r2_old = r2;
r2 = axpyNormCuda(-alpha, (float *)Ap, (float *)r, len);
beta = r2 / r2_old;
axpyZpbxCuda(alpha, (float *)p, (float *)x, (float *)r, beta, len);
k++;
//printf("%d iterations, r2 = %e\n", k, r2);
}
perf->secs = stopwatchReadSeconds();
//if (k==maxiters)
//printf("Exceeded maximum iterations %d\n", maxiters);
float gflops = k*(1.0e-9*Nh)*(2*(2*1320+48) + 10*spinorSiteSize);
//printf("%f gflops\n", k*gflops / stopwatchReadSeconds());
perf->gflops = gflops;
perf->iter = k;
#if 0
// Calculate the true residual
MatPCDagMatPCCuda(Ap, gauge, x, perf->kappa, tmp, perf->matpc_type);
copyCuda((float *)r, (float *)b, len);
mxpyCuda((float *)Ap, (float *)r, len);
double true_res = normCuda((float *)r, len);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
if (perf->preserve_source == QUDA_PRESERVE_SOURCE_YES) freeParitySpinor(r);
freeParitySpinor(p);
freeParitySpinor(Ap);
return;
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment