Advanced Computing Platform for Theoretical Physics

Commit 9425ca6f authored by Mathias Wagner
Browse files

Merge pull request #434 from lattice/hotfix/0.8_update

added date to NEWS,README file and fixed some typos
parents d38ed85e d744cc15
Copyright (c) 2009-2015 QUDA Developers
Copyright (c) 2009-2016 QUDA Developers
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
Version 0.8.0 - xxth December 2015
Version 0.8.0 - 1st February 2016
- Removed all Tesla-generation GPU support from QUDA (sm_1x). As a
result, QUDA now requires a minimum of the Fermi-generation GPUs.
- Added support for building QUDA using cmake. This gives a much more
flexible and extensible build system as well as allowing
out-of-source-directory building.
out-of-source-directory building. For details see:
https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake
- Improved strong scaling of the multi-shift solver by overlapping the
shift updates with the subsequent iteration's dslash comms waiting.
- Improved performance of multi-shift solver by preventing unecessary
- Improved performance of multi-shift solver by preventing unnecessary
refinement of shifted solutions once the residual falls below
floating point precision.
......@@ -45,9 +46,9 @@ Version 0.8.0 - xxth December 2015
force kernels. This also improves compilation time and reduces
library size.
- Added support for imaginary chemical potential to the staggeed phase
- Added support for imaginary chemical potential to the staggered phase
application / removal kernel, as well as fixing bugs in this
reoutine.
routine.
- Algorithms that previously used double-precision atomics now use a
cub reduction. This drastically improves performance of such
......
Release Notes for QUDA v0.8.0 xxth December 2015
Release Notes for QUDA v0.8.0 1st February 2016
-----------------------------
Overview:
......
......@@ -68,12 +68,14 @@ extern "C" {
int overlap; /**< Width of overlapping domains */
int use_resident_gauge; /**< Use the resident gauge field */
int use_resident_mom; /**< Use the resident mom field */
int make_resident_gauge; /**< Make the gauge field resident */
int make_resident_mom; /**< Make the mom field resident */
int return_gauge; /**< Return the new gauge field */
int return_mom; /**< Return the new mom field */
int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */
int use_resident_gauge; /**< Use the resident gauge field as input */
int use_resident_mom; /**< Use the resident momentum field as input*/
int make_resident_gauge; /**< Make the result gauge field resident */
int make_resident_mom; /**< Make the result momentum field resident */
int return_result_gauge; /**< Return the result gauge field */
int return_result_mom; /**< Return the result momentum field */
} QudaGaugeParam;
......
......@@ -109,19 +109,21 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
#endif
#if defined INIT_PARAM
P(overwrite_mom, 0);
P(use_resident_gauge, 0);
P(use_resident_mom, 0);
P(make_resident_gauge, 0);
P(make_resident_mom, 0);
P(return_gauge, 1);
P(return_mom, 1);
P(return_result_gauge, 1);
P(return_result_mom, 1);
#else
P(overwrite_mom, INVALID_INT);
P(use_resident_gauge, INVALID_INT);
P(use_resident_mom, INVALID_INT);
P(make_resident_gauge, INVALID_INT);
P(make_resident_mom, INVALID_INT);
P(return_gauge, INVALID_INT);
P(return_mom, INVALID_INT);
P(return_result_gauge, INVALID_INT);
P(return_result_mom, INVALID_INT);
#endif
#ifdef INIT_PARAM
......
......@@ -357,7 +357,6 @@ extern char* gitversion;
* Set the device that QUDA uses.
*/
void initQudaDevice(int dev) {
//static bool initialized = false;
if (initialized) return;
initialized = true;
......@@ -434,6 +433,8 @@ void initQudaDevice(int dev) {
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
//cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
cudaGetDeviceProperties(&deviceProp, dev);
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}
/*
......@@ -441,6 +442,8 @@ void initQudaDevice(int dev) {
*/
void initQudaMemory()
{
profileInit.TPSTART(QUDA_PROFILE_TOTAL);
if (!comms_initialized) init_default_comms();
streams = new cudaStream_t[Nstream];
......@@ -470,6 +473,8 @@ void initQudaMemory()
cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);
loadTuneCache(getVerbosity());
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}
void initQuda(int dev)
......@@ -489,8 +494,6 @@ void initQuda(int dev)
pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutex_init(&pthread_mutex, &mutex_attr);
#endif
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}
......@@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
if (qudaGaugeParam->use_resident_mom) {
if (!gaugePrecise) errorQuda("No resident momentum field to use");
cudaMom = momResident;
if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
} else {
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
gParamMom.precision = qudaGaugeParam->cuda_prec;
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
cudaMom = new cudaGaugeField(gParamMom);
if (!qudaGaugeParam->overwrite_mom) cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
}
......@@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
path_length, loop_coeff, num_paths, max_length);
profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
if (qudaGaugeParam->return_mom) {
if (qudaGaugeParam->return_result_mom) {
profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
......@@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
updateMomentum(*momResident, 1.0, *cudaMom);
}
if (gParam->return_mom) {
if (gParam->return_result_mom) {
profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
// Close the paths, make anti-hermitian, and store in compressed format
if (gParam->return_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
}
......@@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,
#ifdef GPU_STAGGERED_OPROD
#ifndef BUILD_QDP_INTERFACE
#error "Staggerd oprod requires BUILD_QDP_INTERFACE";
#error "Staggered oprod requires BUILD_QDP_INTERFACE";
#endif
using namespace quda;
profileStaggeredOprod.TPSTART(QUDA_PROFILE_TOTAL);
......@@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.gauge = gauge;
gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField(gParam) : NULL;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
gParam.reconstruct = gParam.order == QUDA_TIFR_GAUGE_ORDER ?
QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
......@@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
(bool)conj_mom, (bool)exact);
profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);
if (param->return_gauge) {
if (param->return_result_gauge) {
// copy the gauge field back to the host
profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
......@@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.link_type = QUDA_GENERAL_LINKS;
gParam.gauge = gauge_h;
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
// create the device fields
......@@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
profileProject.TPSTART(QUDA_PROFILE_D2H);
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
profileProject.TPSTOP(QUDA_PROFILE_D2H);
if (param->make_resident_gauge) {
......@@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.link_type = QUDA_GENERAL_LINKS;
gParam.gauge = gauge_h;
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
// create the device fields
......@@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
profilePhase.TPSTART(QUDA_PROFILE_D2H);
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
profilePhase.TPSTOP(QUDA_PROFILE_D2H);
if (param->make_resident_gauge) {
......
......@@ -293,11 +293,11 @@ void qudaHisqForce(int prec, const double level2_coeff[6], const double fat7_coe
if (!invalidate_quda_mom) {
gParam.use_resident_mom = true;
gParam.make_resident_mom = true;
gParam.return_mom = false;
gParam.return_result_mom = false;
} else {
gParam.use_resident_mom = false;
gParam.make_resident_mom = false;
gParam.return_mom = true;
gParam.return_result_mom = true;
}
long long flops;
......@@ -546,11 +546,20 @@ void qudaGaugeForce( int precision,
if (!invalidate_quda_mom) {
qudaGaugeParam.use_resident_mom = true;
qudaGaugeParam.make_resident_mom = true;
qudaGaugeParam.return_mom = false;
qudaGaugeParam.return_result_mom = false;
// this means when we compute the momentum, we accumulate to the
// preexisting resident momentum instead of overwriting it
qudaGaugeParam.overwrite_mom = false;
} else {
qudaGaugeParam.use_resident_mom = false;
qudaGaugeParam.make_resident_mom = false;
qudaGaugeParam.return_mom = true;
qudaGaugeParam.return_result_mom = true;
// this means we compute momentum into a fresh field, copy it back
// and sum to current momentum in MILC. This saves an initial
// CPU->GPU download of the current momentum.
qudaGaugeParam.overwrite_mom = true;
}
int max_length = 6;
......
......@@ -59,12 +59,21 @@ module quda_fortran
! Whether the staggered phase has already been applied to the links
integer(4) :: staggered_phase_applied
! Imaginary chemical potential
real(8) :: i_mu
integer(4) :: overlap ! width of domain overlap
! When computing momentum, should we overwrite it or accumulate
! to it (only presently supported in gauge-force)
integer(4) :: overwrite_mom
integer(4) :: use_resident_gauge ! Use the resident gauge field
integer(4) :: use_resident_mom ! Use the resident mom field
integer(4) :: make_resident_gauge ! Make the gauge field resident
integer(4) :: make_resident_mom ! Make the mom field resident
integer(4) :: use_resident_mom ! Use the resident momentum field
integer(4) :: make_resident_gauge ! Make the result gauge field resident
integer(4) :: make_resident_mom ! Make the result momentum field resident
integer(4) :: return_result_gauge ! Return the result gauge field
integer(4) :: return_result_mom ! Return the result momentum field
end type quda_gauge_param
......@@ -114,6 +123,9 @@ module quda_fortran
! Actual L2 residual norm achieved in solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_offset
! Iterated L2 residual achieved in multi shift solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: iter_res_offset
! Actual heavy quark residual norm achieved in solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_hq_offset
......@@ -206,6 +218,18 @@ module quda_fortran
integer(4)::max_search_dim ! for magma library this parameter must be multiple 16?
integer(4)::rhs_idx
integer(4)::deflation_grid !total deflation space is nev*deflation_grid
integer(4)::use_reduced_vector_set ! eigCG: specifies whether to use reduced eigenvector set
real(8):: eigenval_tol ! eigCG: selection criterion for the reduced eigenvector set
integer(4)::use_cg_updates ! mixed precision eigCG:whether to use cg refinement corrections in the incremental stage
real(8)::cg_iterref_tol ! mixed precision eigCG: tolerance for cg refinement corrections in the incremental stage
integer(4)::eigcg_max_restarts ! mixed precision eigCG tuning parameter: minimum search vector space restarts
integer(4)::max_restart_num ! initCG tuning parameter: maximum restarts
real(8)::inc_tol ! initCG tuning parameter: decrease in absolute value of the residual within each restart cycle
! Parameters for setting data residency of the solver
integer(8)::make_resident_solution ! Whether to make the solution vector(s) resident after the solve
integer(8)::use_resident_solution ! Whether to use the resident solution vector(s)
end type quda_invert_param
end module quda_fortran
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment