Advanced Computing Platform for Theoretical Physics

Commit 12b32dea authored by rbabich's avatar rbabich
Browse files

quda: standardized error reporting


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@599 be54200a-260c-0410-bdd7-ce6af2a381ab
parent d8c36033
......@@ -4,8 +4,6 @@ Version 0.x
- Introduced new interface functions newQudaGaugeParam() and
newQudaInvertParam() to allow for enhanced error checking. See
invert_test for an example of their use.
- Modified loadCloverQuda() to take gauge_param as an additional
parameter.
- Added auto-tuning blas to improve performance (see README for details).
- Improved stability of the half precision 8-parameter SU(3)
reconstruction (with thanks to Guochun Shi).
......
......@@ -83,7 +83,7 @@ extern "C" {
void initQuda(int dev);
void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param);
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param);
void invertQuda(void *h_x, void *h_b, QudaInvertParam *param);
......
......@@ -3,6 +3,10 @@
#include <cuda_runtime.h>
#ifdef USE_QMP
#include <qmp.h>
#endif
//#define L1 4 // "x" dimension
//#define L2 4 // "y" dimension
//#define L3 4 // "z" dimension
......@@ -31,6 +35,7 @@
#define Tboundary QudaTboundary
#include <enum_quda.h>
#include <util_quda.h>
#ifdef __cplusplus
extern "C" {
......
#ifndef _UTIL_QUDA_H
#define _UTIL_QUDA_H
#ifdef USE_QMP
#include <qmp.h>
#define printfQuda(...) do { \
if (QMP_get_node_number() == 0) { \
printf(__VA_ARGS__); \
fflush(stdout); \
} \
} while (0)
#define errorQuda(...) do { \
printf("QUDA error: " __VA_ARGS__); \
printf(" (node %d, " __FILE__ ":%d)\n", \
QMP_get_node_number(), __LINE__); \
QMP_abort(1); \
} while (0)
#else
#define printfQuda(...) do { printf(__VA_ARGS__); fflush(stdout); } while (0)
#define errorQuda(...) do { \
printf("QUDA error: " __VA_ARGS__); \
printf(" (" __FILE__ ":%d)\n", __LINE__); \
exit(1); \
} while (0)
#endif // USE_QMP
#define warningQuda(...) do { \
printfQuda("QUDA warning: " __VA_ARGS__); \
printfQuda("\n"); \
} while (0)
#define checkCudaError() do { \
cudaError_t error = cudaGetLastError(); \
if (error != cudaSuccess) \
errorQuda("CUDA: %s", cudaGetErrorString(error)); \
} while (0)
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -45,43 +45,37 @@ void initBlas(void) {
if (!d_reduceFloat) {
if (cudaMalloc((void**) &d_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!d_reduceComplex) {
if (cudaMalloc((void**) &d_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!d_reduceFloat3) {
if (cudaMalloc((void**) &d_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) == cudaErrorMemoryAllocation) {
printf("Error allocating device reduction array\n");
exit(0);
errorQuda("Error allocating device reduction array");
}
}
if (!h_reduceFloat) {
if (cudaMallocHost((void**) &h_reduceFloat, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
if (!h_reduceComplex) {
if (cudaMallocHost((void**) &h_reduceComplex, REDUCE_MAX_BLOCKS*sizeof(QudaSumComplex)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
if (!h_reduceFloat3) {
if (cudaMallocHost((void**) &h_reduceFloat3, REDUCE_MAX_BLOCKS*sizeof(QudaSumFloat3)) == cudaErrorMemoryAllocation) {
printf("Error allocating host reduction array\n");
exit(0);
errorQuda("Error allocating host reduction array");
}
}
......@@ -309,21 +303,18 @@ texture<float, 1, cudaReadModeElementType> texNorm5;
inline void checkSpinor(ParitySpinor &a, ParitySpinor &b) {
if (a.precision != b.precision) {
printf("checkSpinor error, precisions do not match: %d %d\n", a.precision, b.precision);
exit(-1);
errorQuda("checkSpinor: precisions do not match: %d %d", a.precision, b.precision);
}
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
errorQuda("checkSpinor: lengths do not match: %d %d", a.length, b.length);
}
}
// For kernels with precision conversion built in
inline void checkSpinorLength(ParitySpinor &a, ParitySpinor &b) {
if (a.length != b.length) {
printf("checkSpinor error, lengths do not match: %d %d\n", a.length, b.length);
exit(-1);
errorQuda("checkSpinor: lengths do not match: %d %d", a.length, b.length);
}
}
......
......@@ -2,8 +2,13 @@
// This file defines functions to either initialize, check, or print
// the QUDA gauge and inverter parameters. It gets included in
// invert_quda.cpp, after either INIT_PARAM, CHECK_PARAM, or
// interface_quda.cpp, after either INIT_PARAM, CHECK_PARAM, or
// PRINT_PARAM is defined.
//
// If you're reading this file because it was mentioned in a "QUDA
// error" message, it probably means that you forgot to set one of the
// gauge or inverter parameters in your application before calling
// loadGaugeQuda() or invertQuda().
#include <float.h>
#define INVALID_INT QUDA_INVALID_ENUM
......@@ -12,18 +17,12 @@
// define macro to carry out the appropriate action for a given parameter
#if defined INIT_PARAM
#define P(x, val) do { ret.x = val; } while (0)
#define P(x, val) ret.x = val
#elif defined CHECK_PARAM
#define P(x, val) do { \
if (param->x == val) { \
printf("QUDA error: " #x " undefined.\n"); \
exit(1); \
} \
} while (0)
#define P(x, val) if (param->x == val) errorQuda("Parameter " #x " undefined")
#elif defined PRINT_PARAM
#define P(x, val) do { \
printf((val == INVALID_DOUBLE) ? #x " = %g\n" : #x " = %d\n", param->x); \
} while (0)
#define P(x, val) \
printfQuda((val == INVALID_DOUBLE) ? #x " = %g\n" : #x " = %d\n", param->x)
#else
#error INIT_PARAM, CHECK_PARAM, and PRINT_PARAM all undefined in check_params.h
#endif
......@@ -38,7 +37,7 @@ QudaGaugeParam newQudaGaugeParam(void) {
static void checkGaugeParam(QudaGaugeParam *param) {
#else
void printQudaGaugeParam(QudaGaugeParam *param) {
printf("QUDA Gauge Parameters:\n");
printfQuda("QUDA Gauge Parameters:\n");
#endif
P(X[0], INVALID_INT);
......@@ -76,7 +75,7 @@ QudaInvertParam newQudaInvertParam(void) {
static void checkInvertParam(QudaInvertParam *param) {
#else
void printQudaInvertParam(QudaInvertParam *param) {
printf("QUDA Inverter Parameters:\n");
printfQuda("QUDA Inverter Parameters:\n");
#endif
P(dslash_type, QUDA_INVALID_DSLASH);
......
......@@ -27,16 +27,14 @@ void allocateParityClover(ParityClover *ret, int *X, int pad, Precision precisio
if (!ret->clover) {
if (cudaMalloc((void**)&(ret->clover), ret->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover term\n");
exit(0);
errorQuda("Error allocating clover term");
}
}
if (!ret->cloverNorm) {
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret->cloverNorm, ret->bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating cloverNorm\n");
exit(0);
errorQuda("Error allocating cloverNorm");
}
}
}
......@@ -178,23 +176,19 @@ void loadParityClover(ParityClover ret, void *clover, Precision cpu_prec,
void *packedClover, *packedCloverNorm;
if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
printf("QUDA error: invalid clover order\n");
exit(-1);
errorQuda("Invalid clover_order");
}
#ifndef __DEVICE_EMULATION__
if (cudaMallocHost(&packedClover, ret.bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating clover pinned memory\n");
exit(0);
errorQuda("Error allocating clover pinned memory");
}
if (ret.precision == QUDA_HALF_PRECISION)
if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
printf("Error allocating clover pinned memory\n");
exit(0);
errorQuda("Error allocating clover pinned memory");
}
#else
packedClover = malloc(ret.bytes);
......@@ -241,12 +235,10 @@ void loadFullClover(FullClover ret, void *clover, Precision cpu_prec,
void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;
if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("QUDA error: cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
printf("QUDA error: invalid clover order\n");
exit(-1);
errorQuda("Invalid clover order");
}
#ifndef __DEVICE_EMULATION__
......@@ -321,8 +313,7 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
loadParityClover(ret.even, clover, cpu_prec, clover_order);
loadParityClover(ret.odd, clover_odd, cpu_prec, clover_order);
} else {
printf("QUDA error: CloverFieldOrder %d not supported\n", clover_order);
exit(-1);
errorQuda("Invalid clover_order");
}
}
......@@ -330,8 +321,7 @@ void loadCloverField(FullClover ret, void *clover, Precision cpu_prec, CloverFie
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision, QudaInvertParam invert_param)
{
if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
// X should contain the dimensions of the even/odd sublattice
......
static void checkSpinor(ParitySpinor out, ParitySpinor in) {
if (in.precision != out.precision) {
printf("Error in dslash quda: input and out spinor precisions don't match\n");
exit(-1);
errorQuda("Input and output spinor precisions don't match in dslash_quda");
}
if (in.stride != out.stride) {
printf("Error in dslash quda: input and out spinor strides don't match\n");
exit(-1);
errorQuda("Input and output spinor strides don't match in dslash_quda");
}
#if (__CUDA_ARCH__ != 130)
if (in.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
static void checkGaugeSpinor(ParitySpinor spinor, FullGauge gauge) {
if (spinor.volume != gauge.volume) {
printf("Error, spinor volume %d doesn't match gauge volume %d\n", spinor.volume, gauge.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match gauge volume %d", spinor.volume, gauge.volume);
}
#if (__CUDA_ARCH__ != 130)
if (gauge.precision == QUDA_DOUBLE_PRECISION) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
static void checkCloverSpinor(ParitySpinor spinor, FullClover clover) {
if (spinor.volume != clover.even.volume) {
printf("Error, spinor volume %d doesn't match even clover volume %d\n",
spinor.volume, clover.even.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match even clover volume %d",
spinor.volume, clover.even.volume);
}
if (spinor.volume != clover.odd.volume) {
printf("Error, spinor volume %d doesn't match odd clover volume %d\n",
spinor.volume, clover.odd.volume);
exit(-1);
errorQuda("Spinor volume %d doesn't match odd clover volume %d",
spinor.volume, clover.odd.volume);
}
#if (__CUDA_ARCH__ != 130)
if ((clover.even.precision == QUDA_DOUBLE_PRECISION) ||
(clover.odd.precision == QUDA_DOUBLE_PRECISION)) {
printf("Double precision not supported on this GPU\n");
exit(-1);
errorQuda("Double precision not supported on this GPU");
}
#endif
}
......@@ -47,8 +47,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
cudaMemcpyToSymbol("cl_stride", &cl_stride, sizeof(int));
if (Vh%BLOCK_DIM != 0) {
printf("Error, volume not a multiple of the thread block size\n");
exit(-1);
errorQuda("Volume not a multiple of the thread block size");
}
int X1 = 2*gauge.X[0];
......@@ -122,12 +121,7 @@ void initDslashConstants(FullGauge gauge, int sp_stride, int cl_stride) {
float h_pi_f = M_PI;
cudaMemcpyToSymbol("pi_f", &(h_pi_f), sizeof(float));
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("initDslashConstants error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
initDslash = 1;
}
......@@ -175,13 +169,7 @@ void dslashCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int parity,
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashHCuda(out, gauge, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += 1320*in.volume;
}
......@@ -386,13 +374,7 @@ void dslashXpayCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, int pari
} else if (in.precision == QUDA_HALF_PRECISION) {
dslashXpayHCuda(out, gauge, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("dslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+48)*in.volume;
}
......@@ -616,8 +598,7 @@ void MatPCCuda(ParitySpinor out, FullGauge gauge, ParitySpinor in, double kappa,
dslashCuda(tmp, gauge, in, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
errorQuda("matpc_type not valid for plain Wilson");
}
}
......@@ -666,13 +647,7 @@ void cloverDslashCuda(ParitySpinor out, FullGauge gauge, FullClover cloverInv,
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashHCuda(out, gauge, cloverInv, in, parity, dagger);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+504)*in.volume;
}
......@@ -1179,13 +1154,7 @@ void cloverDslashXpayCuda(ParitySpinor out, FullGauge gauge, FullClover cloverIn
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverDslashXpayHCuda(out, gauge, cloverInv, in, parity, dagger, x, a);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverDslashXpayCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += (1320+504+48)*in.volume;
}
......@@ -1725,7 +1694,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
if (((matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) || (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC))
&& (clover.even.clover == NULL)) {
printf("QUDA error: For asymmetric matpc_type, the uninverted clover term must be loaded\n");
errorQuda("For asymmetric matpc_type, the uninverted clover term must be loaded");
}
// FIXME: For asymmetric, a "dslashCxpay" kernel would improve performance.
......@@ -1746,8 +1715,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
cloverDslashCuda(tmp, gauge, cloverInv, in, 0, dagger);
cloverDslashXpayCuda(out, gauge, cloverInv, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
} else { // symmetric preconditioning, dagger
if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -1759,8 +1727,7 @@ void cloverMatPCCuda(ParitySpinor out, FullGauge gauge, FullClover clover, FullC
cloverDslashCuda(tmp, gauge, cloverInv, out, 0, dagger);
dslashXpayCuda(out, gauge, tmp, 1, dagger, in, kappa2);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
}
}
......@@ -1803,13 +1770,7 @@ void cloverCuda(ParitySpinor out, FullGauge gauge, FullClover clover,
} else if (in.precision == QUDA_HALF_PRECISION) {
cloverHCuda(out, gauge, clover, in, parity);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("cloverCuda error: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
dslash_quda_flops += 504*in.volume;
}
......
......@@ -488,15 +488,13 @@ static void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct
if (!cudaGauge->even) {
if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even gauge field\n");
exit(0);
errorQuda("Error allocating even gauge field");
}
}
if (!cudaGauge->odd) {
if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating even odd gauge field\n");
exit(0);
errorQuda("Error allocating even odd gauge field");
}
}
......@@ -531,22 +529,14 @@ static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, GaugeFiel
packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh, pad);
packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
}
cudaError_t error = cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
errorQuda("Invalid gauge_order");
}
error = cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
checkCudaError();
cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice);
checkCudaError();
#ifndef __DEVICE_EMULATION__
cudaFreeHost(packedEven);
......@@ -583,8 +573,7 @@ static void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, Gauge
unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
} else {
printf("Sorry, %d GaugeFieldOrder not supported\n", gauge_order);
exit(-1);
errorQuda("Invalid gauge_order");
}
#ifndef __DEVICE_EMULATION__
......@@ -602,8 +591,7 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec,
Tboundary t_boundary, int *XX, double anisotropy, int pad)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
Anisotropy = anisotropy;
......@@ -658,8 +646,7 @@ void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, Precision cuda_prec,
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, Precision cpu_prec, GaugeFieldOrder gauge_order)
{
if (cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) {
......
......@@ -40,8 +40,7 @@ FullClover cudaCloverInvSloppy;
static void checkPrecision(QudaPrecision precision)
{
if (precision == QUDA_HALF_PRECISION) {
printf("Half precision not supported on cpu\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
}
......@@ -50,29 +49,26 @@ void initQuda(int dev)
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "No devices supporting CUDA.\n");
exit(EXIT_FAILURE);
errorQuda("No devices supporting CUDA");
}
for(int i=0; i<deviceCount; i++) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, i);
fprintf(stderr, "found device %d: %s\n", i, deviceProp.name);
fprintf(stderr, "QUDA: Found device %d: %s\n", i, deviceProp.name);
}
if(dev<0) {
if (dev < 0) {
dev = deviceCount - 1;
//dev = 0;
}
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
if (deviceProp.major < 1) {
fprintf(stderr, "Device %d does not support CUDA.\n", dev);
exit(EXIT_FAILURE);
errorQuda("Device %d does not support CUDA", dev);
}
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
fprintf(stderr, "QUDA: Using device %d: %s\n", dev, deviceProp.name);
cudaSetDevice(dev);
cudaGaugePrecise.even = NULL;
......@@ -126,30 +122,25 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
restoreGaugeField(h_gauge, &cudaGaugePrecise, param->cpu_prec, param->gauge_order);
}
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param)
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{
if (!h_clover && !h_clovinv) {
printf("QUDA error: loadCloverQuda() called with neither clover term nor inverse\n");
exit(-1);
errorQuda("loadCloverQuda() called with neither clover term nor inverse");
}
if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
printf("QUDA error: half precision not supported on CPU\n");
exit(-1);
errorQuda("Half precision not supported on CPU");
}
if (cudaGaugePrecise.even == NULL) {
printf("QUDA error: gauge field must be loaded before clover\n");
exit(-1);
errorQuda("Gauge field must be loaded before clover");
}
if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) {
printf("QUDA error: wrong dslash_type in loadCloverQuda()\n");
exit(-1);
errorQuda("Wrong dslash_type in loadCloverQuda()");
}
int X[4];
for (int i=0; i<4; i++) {
X[i] = gauge_param->X[i];
X[i] = cudaGaugePrecise.X[i];
}
X[0] /= 2; // dimensions of the even-odd sublattice
inv_param->cloverGiB = 0;
......@@ -172,8 +163,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaGaugeParam *gauge_param
allocateCloverField(&cudaCloverInvPrecise, X, inv_param->cl_pad, inv_param->clover_cuda_prec);
if (!h_clovinv) {
printf("QUDA error: clover term inverse not implemented yet\n");
exit(-1);
errorQuda("Clover term inverse not implemented yet");
} else {
loadCloverField(cudaCloverInvPrecise, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order);
}
......@@ -232,8 +222,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int parity,
} else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
cloverDslashCuda(out, cudaGaugePrecise, cudaCloverInvPrecise, in, parity, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -261,8 +250,7 @@ void MatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
cloverMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, kappa,
tmp, inv_param->matpc_type, dagger);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -291,8 +279,7 @@ void MatPCDagMatPCQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
cloverMatPCDagMatPCCuda(out, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, in, kappa,
tmp, inv_param->matpc_type);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveParitySpinor(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -321,8 +308,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, int dagger)
cloverMatCuda(out, cudaGaugePrecise, cudaCloverPrecise, in, kappa, tmp, dagger);
freeParitySpinor(tmp);
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
retrieveSpinorField(h_out, out, inv_param->cpu_prec, inv_param->dirac_order);
......@@ -388,8 +374,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
// in = b_o + k D_oe b_e
dslashXpayCuda(in, cudaGaugePrecise, b.even, 1, 0, b.odd, kappa);
} else {
printf("QUDA error: matpc_type not valid for plain Wilson\n");
exit(-1);
errorQuda("matpc_type not valid for plain Wilson");
}
} else if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
if (param->matpc_type == QUDA_MATPC_EVEN_EVEN) {
......@@ -415,12 +400,10 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
cloverCuda(aux, cudaGaugePrecise, cudaCloverInvPrecise, b.even, 0); // safe even when aux = b.even
dslashXpayCuda(in, cudaGaugePrecise, aux, 1, 0, b.odd, kappa);
} else {
printf("QUDA error: invalid matpc_type\n");
exit(-1);
errorQuda("Invalid matpc_type");
}
} else {
printf("QUDA error: unsupported dslash_type\n");
exit(-1);
errorQuda("Unsupported dslash_type");
}
} else if (param->solution_type == QUDA_MATPC_SOLUTION ||
......@@ -465,8 +448,7 @@ void invertQuda(void *h_x, void *h_b, QudaInvertParam *param)
invertBiCGstabCuda(out, in, tmp, param, QUDA_DAG_NO);
break;
default:
printf("Inverter type %d not implemented\n", param->inv_type);
exit(-1);
errorQuda("Inverter type %d not implemented", param->inv_type);
}
if (param->solution_type == QUDA_MAT_SOLUTION) {
......
......@@ -13,12 +13,7 @@
void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cloverInv, ParitySpinor in,
QudaInvertParam *invert_param, ParitySpinor tmp, DagType dag_type) {
{cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("B4 MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}}
checkCudaError();
double kappa = invert_param->kappa;
if (invert_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER)
......@@ -30,14 +25,7 @@ void MatVec(ParitySpinor out, FullGauge gauge, FullClover clover, FullClover cl
cloverMatPCCuda(out, gauge, clover, cloverInv, in, kappa, tmp,
invert_param->matpc_type, dag_type);
}
cudaError_t error = cudaGetLastError();
cudaGetLastError();
if(error != cudaSuccess) {
printf("MatVec: %s\n", cudaGetErrorString(error));
exit(0);
}
checkCudaError();
}
void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
......@@ -95,7 +83,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
double maxrr = rNorm;
double maxrx = rNorm;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
dslash_quda_flops = 0;
......@@ -166,20 +154,20 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
}
k++;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
xpyCuda(y, x);
if (k==invert_param->maxiter) printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
if (k==invert_param->maxiter) warningQuda("Exceeded maximum iterations %d", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_VERBOSE) printf("Reliable updates = %d\n", rUpdate);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("Reliable updates = %d\n", rUpdate);
invert_param->secs += stopwatchReadSeconds();
float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
// printf("%f gflops\n", gflops / stopwatchReadSeconds());
// printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
invert_param->gflops += gflops;
invert_param->iter += k;
......@@ -190,7 +178,7 @@ void invertBiCGstabCuda(ParitySpinor x, ParitySpinor b, ParitySpinor r,
copyCuda(b, src);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
printfQuda("Converged after %d iterations, r2 = %e, true_r2 = %e\n", k, sqrt(r2/b2), sqrt(true_res / b2));
#endif
if (invert_param->cuda_prec_sloppy != x.precision) {
......
......@@ -64,7 +64,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
int k=0;
int rUpdate = 0;
if (invert_param->verbosity >= QUDA_VERBOSE) printf("%d iterations, r2 = %e\n", k, r2);
if (invert_param->verbosity >= QUDA_VERBOSE) printfQuda("%d iterations, r2 = %e\n", k, r2);
blas_quda_flops = 0;
......@@ -112,7 +112,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
k++;
if (invert_param->verbosity >= QUDA_VERBOSE)
printf("%d iterations, r2 = %e\n", k, r2);
printfQuda("%d iterations, r2 = %e\n", k, r2);
}
if (x.precision != x_sloppy.precision) copyCuda(x, x_sloppy);
......@@ -122,13 +122,13 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
if (k==invert_param->maxiter)
printf("Exceeded maximum iterations %d\n", invert_param->maxiter);
warningQuda("Exceeded maximum iterations %d", invert_param->maxiter);
if (invert_param->verbosity >= QUDA_SUMMARIZE)
printf("Reliable updates = %d\n", rUpdate);
printfQuda("Reliable updates = %d\n", rUpdate);
float gflops = (blas_quda_flops + dslash_quda_flops)*1e-9;
// printf("%f gflops\n", gflops / stopwatchReadSeconds());
// printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
invert_param->gflops = gflops;
invert_param->iter = k;
......@@ -140,7 +140,7 @@ void invertCgCuda(ParitySpinor x, ParitySpinor b, ParitySpinor y, QudaInvertPara
MatVec(r, cudaGaugePrecise, cudaCloverPrecise, cudaCloverInvPrecise, x, y);
double true_res = xmyNormCuda(b, r);
printf("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
printfQuda("Converged after %d iterations, r2 = %e, true_r2 = %e\n",
k, r2, true_res / b2);
#endif
......
......@@ -103,13 +103,11 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPre
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_complex(): length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_complex: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_complex: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_complex: block size greater than maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -129,16 +127,12 @@ cuDoubleComplex REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPre
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceComplex, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceComplex, d_reduceComplex, blasGrid.x*sizeof(QudaSumComplex), cudaMemcpyDeviceToHost);
checkCudaError();
cuDoubleComplex gpu_result;
gpu_result.x = 0;
......
......@@ -85,13 +85,11 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_core: length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_core: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_core: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_core: block size greater than maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -111,16 +109,12 @@ double REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision pr
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceFloat, d_reduceFloat, blasGrid.x*sizeof(QudaSumFloat), cudaMemcpyDeviceToHost);
checkCudaError();
double cpu_sum = 0;
for (int i = 0; i < blasGrid.x; i++) cpu_sum += h_reduceFloat[i];
......
......@@ -117,13 +117,11 @@ double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision p
setBlock(kernel, n, precision);
if (n % blasBlock.x != 0) {
printf("ERROR reduce_triple_core: length %d must be a multiple of %d\n", n, blasBlock.x);
exit(-1);
errorQuda("reduce_triple_core: length %d must be a multiple of %d", n, blasBlock.x);
}
if (blasBlock.x > REDUCE_MAX_BLOCKS) {
printf("ERROR reduce_triple_core: block size greater then maximum permitted\n");
exit(-1);
errorQuda("reduce_triple_core: block size greater then maximum permitted");
}
#if (REDUCE_TYPE == REDUCE_KAHAN)
......@@ -143,16 +141,12 @@ double3 REDUCE_FUNC_NAME(Cuda) (REDUCE_TYPES, int n, int kernel, QudaPrecision p
} else if (blasBlock.x == 1024) {
REDUCE_FUNC_NAME(Kernel)<1024><<< blasGrid, blasBlock, smemSize >>>(REDUCE_PARAMS, d_reduceFloat3, n);
} else {
printf("Reduction not implemented for %d threads\n", blasBlock.x);
exit(-1);
errorQuda("Reduction not implemented for %d threads", blasBlock.x);
}
// copy result from device to host, and perform final reduction on CPU
cudaError_t error = cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(error));
exit(-1);
}
cudaMemcpy(h_reduceFloat3, d_reduceFloat3, blasGrid.x*sizeof(QudaSumFloat3), cudaMemcpyDeviceToHost);
checkCudaError();
double3 gpu_result;
gpu_result.x = 0;
......
......@@ -35,14 +35,12 @@ ParitySpinor allocateParitySpinor(int *X, Precision precision, int pad) {
else ret.bytes = ret.length*sizeof(short);
if (cudaMalloc((void**)&ret.spinor, ret.bytes) == cudaErrorMemoryAllocation) {
printf("Error allocating spinor\n");
exit(0);
errorQuda("Error allocating spinor");
}
if (precision == QUDA_HALF_PRECISION) {
if (cudaMalloc((void**)&ret.spinorNorm, ret.bytes/12) == cudaErrorMemoryAllocation) {
printf("Error allocating spinorNorm\n");
exit(0);
errorQuda("Error allocating spinorNorm");
}
}
......@@ -370,8 +368,7 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
DiracFieldOrder dirac_order) {
if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
printf("Error, cannot have CUDA double precision without double CPU precision\n");
exit(-1);
errorQuda("Cannot have CUDA double precision without CPU double precision");
}
if (ret.precision != QUDA_HALF_PRECISION) {
......@@ -462,8 +459,7 @@ void loadSpinorField(FullSpinor ret, void *spinor, Precision cpu_prec, DiracFiel
loadParitySpinor(ret.even, spinor_odd, cpu_prec, dirac_order);
loadParitySpinor(ret.odd, spinor, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
errorQuda("Invalid dirac_order");
}
}
......@@ -543,8 +539,7 @@ void retrieveSpinorField(void *res, FullSpinor spinor, Precision cpu_prec, Dirac
retrieveParitySpinor(res, spinor.odd, cpu_prec, dirac_order);
retrieveParitySpinor(res_odd, spinor.even, cpu_prec, dirac_order);
} else {
printf("DiracFieldOrder %d not supported\n", dirac_order);
exit(-1);
errorQuda("Invalid dirac_order");
}
}
......
......@@ -52,7 +52,7 @@ void init() {
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
......@@ -69,7 +69,7 @@ void init() {
inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.ga_pad = 0;
inv_param.sp_pad = 0;
......@@ -143,7 +143,7 @@ void init() {
gauge = cudaGaugePrecise;
if (clover_yes) {
loadCloverQuda(NULL, hostCloverInv, &gauge_param, &inv_param);
loadCloverQuda(NULL, hostCloverInv, &inv_param);
clover = cudaCloverPrecise;
cloverInv = cudaCloverInvPrecise;
}
......
......@@ -29,9 +29,9 @@ int main(int argc, char **argv)
gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
gauge_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
gauge_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12;
gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
......@@ -55,8 +55,8 @@ int main(int argc, char **argv)
inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
inv_param.dirac_order = QUDA_DIRAC_ORDER;
......@@ -66,8 +66,8 @@ int main(int argc, char **argv)
if (clover_yes) {
inv_param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = QUDA_DOUBLE_PRECISION;
inv_param.clover_cuda_prec = QUDA_SINGLE_PRECISION;
inv_param.clover_cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
}
inv_param.verbosity = QUDA_VERBOSE;
......@@ -102,7 +102,7 @@ int main(int argc, char **argv)
initQuda(device);
loadGaugeQuda((void*)gauge, &gauge_param);
if (clover_yes) loadCloverQuda(NULL, clover_inv, &gauge_param, &inv_param);
if (clover_yes) loadCloverQuda(NULL, clover_inv, &inv_param);
invertQuda(spinorOut, spinorIn, &inv_param);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment