Merge pull request #434 from lattice/hotfix/0.8_update

added date to NEWS,README file and fixed some typos

Merge pull request #434 from lattice/hotfix/0.8_update
added date to NEWS,README file and fixed some typos
9425ca6f · Mathias Wagner · d38ed85e · d744cc15 · 9425ca6f · 9425ca6f
Commit 9425ca6f authored Feb 01, 2016 by Mathias Wagner
--- a/LICENSE
+++ b/LICENSE

-Copyright (c) 2009-2015 QUDA Developers
+Copyright (c) 2009-2016 QUDA Developers

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

--- a/NEWS
+++ b/NEWS
-Version 0.8.0 - xxth December 2015
+Version 0.8.0 - 1st February 2016

 - Removed all Tesla-generation GPU support from QUDA (sm_1x).  As a
  result, QUDA now requires a minimum of the Fermi-generation GPUs.

 - Added support for building QUDA using cmake.  This gives a much more
  flexible and extensible build system as well as allowing
-  out-of-source-directory building.
+  out-of-source-directory building. For details see:
+  https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake

 - Improved strong scaling of the multi-shift solver by overlapping the
  shift updates with the subsequent iteration's dslash comms waiting.

- Improved performance of multi-shift solver by preventing unecessary
+- Improved performance of multi-shift solver by preventing unnecessary
  refinement of shifted solutions once the residual falls below
  floating point precision.

@@ -45,9 +46,9 @@ Version 0.8.0 - xxth December 2015
  force kernels.  This also improves compilation time and reduces
  library size.

- Added support for imaginary chemical potential to the staggeed phase
+- Added support for imaginary chemical potential to the staggered phase
  application / removal kernel, as well as fixing bugs in this
-  reoutine.
+  routine.

 - Algorithms that previously used double-precision atomics now use a
  cub reduction.  This drastically improves performance of such

--- a/README
+++ b/README
-Release Notes for QUDA v0.8.0                         xxth December 2015
+Release Notes for QUDA v0.8.0                         1st February 2016
 -----------------------------

 Overview:

--- a/include/quda.h
+++ b/include/quda.h
@@ -68,12 +68,14 @@ extern "C" {

    int overlap; /**< Width of overlapping domains */

-    int use_resident_gauge;  /**< Use the resident gauge field */
-    int use_resident_mom;    /**< Use the resident mom field */
-    int make_resident_gauge; /**< Make the gauge field resident */
-    int make_resident_mom;   /**< Make the mom field resident */
-    int return_gauge;        /**< Return the new gauge field */
-    int return_mom;          /**< Return the new mom field */
+    int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */
+
+    int use_resident_gauge;  /**< Use the resident gauge field as input */
+    int use_resident_mom;    /**< Use the resident momentum field as input */
+    int make_resident_gauge; /**< Make the result gauge field resident */
+    int make_resident_mom;   /**< Make the result momentum field resident */
+    int return_result_gauge; /**< Return the result gauge field */
+    int return_result_mom;   /**< Return the result momentum field */

  } QudaGaugeParam;


--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -109,19 +109,21 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
 #endif

 #if defined INIT_PARAM
+  P(overwrite_mom, 0);
  P(use_resident_gauge, 0);
  P(use_resident_mom, 0);
  P(make_resident_gauge, 0);
  P(make_resident_mom, 0);
-  P(return_gauge, 1);
-  P(return_mom, 1);
+  P(return_result_gauge, 1);
+  P(return_result_mom, 1);
 #else
+  P(overwrite_mom, INVALID_INT);
  P(use_resident_gauge, INVALID_INT);
  P(use_resident_mom, INVALID_INT);
  P(make_resident_gauge, INVALID_INT);
  P(make_resident_mom, INVALID_INT);
-  P(return_gauge, INVALID_INT);
-  P(return_mom, INVALID_INT);
+  P(return_result_gauge, INVALID_INT);
+  P(return_result_mom, INVALID_INT);
 #endif

 #ifdef INIT_PARAM

--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -357,7 +357,6 @@ extern char* gitversion;
 * Set the device that QUDA uses.
 */
 void initQudaDevice(int dev) {
-
  //static bool initialized = false;
  if (initialized) return;
  initialized = true;
@@ -434,6 +433,8 @@ void initQudaDevice(int dev) {
  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
  //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
  cudaGetDeviceProperties(&deviceProp, dev);
+
+  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 /*
@@ -441,6 +442,8 @@ void initQudaDevice(int dev) {
 */
 void initQudaMemory()
 {
+  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+
  if (!comms_initialized) init_default_comms();

  streams = new cudaStream_t[Nstream];
@@ -470,6 +473,8 @@ void initQudaMemory()
  cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);

  loadTuneCache(getVerbosity());
+
+  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }

 void initQuda(int dev)
@@ -489,8 +494,6 @@ void initQuda(int dev)
  pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
  pthread_mutex_init(&pthread_mutex, &mutex_attr);
 #endif
-
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }


@@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
  if (qudaGaugeParam->use_resident_mom) {
    if (!gaugePrecise) errorQuda("No resident momentum field to use");
    cudaMom = momResident;
+    if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
  } else {
-    gParamMom.create = QUDA_ZERO_FIELD_CREATE;  
+    gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
    gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
    gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
    gParamMom.precision = qudaGaugeParam->cuda_prec;
    gParamMom.create = QUDA_ZERO_FIELD_CREATE;
    cudaMom = new cudaGaugeField(gParamMom);
+    if (!qudaGaugeParam->overwrite_mom) cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
  }

@@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
      path_length, loop_coeff, num_paths, max_length);
  profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);

-  if (qudaGaugeParam->return_mom) {
+  if (qudaGaugeParam->return_result_mom) {
    profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
    cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
    profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
@@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
    updateMomentum(*momResident, 1.0, *cudaMom);
  }

-  if (gParam->return_mom) {
+  if (gParam->return_result_mom) {
    profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
    // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
+    if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
    profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
  }

@@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,

 #ifdef  GPU_STAGGERED_OPROD
 #ifndef BUILD_QDP_INTERFACE
-#error "Staggerd oprod requires BUILD_QDP_INTERFACE";
+#error "Staggered oprod requires BUILD_QDP_INTERFACE";
 #endif
  using namespace quda;
  profileStaggeredOprod.TPSTART(QUDA_PROFILE_TOTAL);
@@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
  gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  gParam.gauge = gauge;
  gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField(gParam) : NULL;
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;

  gParam.reconstruct = gParam.order == QUDA_TIFR_GAUGE_ORDER ? 
   QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
@@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
      (bool)conj_mom, (bool)exact);
  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);

-  if (param->return_gauge) {
+  if (param->return_result_gauge) {
    // copy the gauge field back to the host
    profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
    cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
@@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.gauge = gauge_h;
-   bool need_cpu = !param->use_resident_gauge || param->return_gauge;
+   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
   
   // create the device fields
@@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
     errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
   
   profileProject.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
+   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
   profileProject.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {
@@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.gauge = gauge_h;
-   bool need_cpu = !param->use_resident_gauge || param->return_gauge;
+   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
   
   // create the device fields
@@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
   profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
   
   profilePhase.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
+   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
   profilePhase.TPSTOP(QUDA_PROFILE_D2H);

   if (param->make_resident_gauge) {

--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -293,11 +293,11 @@ void qudaHisqForce(int prec, const double level2_coeff[6], const double fat7_coe
  if (!invalidate_quda_mom) {
    gParam.use_resident_mom = true;
    gParam.make_resident_mom = true;
-    gParam.return_mom = false;
+    gParam.return_result_mom = false;
  } else {
    gParam.use_resident_mom = false;
    gParam.make_resident_mom = false;
-    gParam.return_mom = true;
+    gParam.return_result_mom = true;
  }

  long long flops;
@@ -546,11 +546,20 @@ void qudaGaugeForce( int precision,
  if (!invalidate_quda_mom) {
    qudaGaugeParam.use_resident_mom = true;
    qudaGaugeParam.make_resident_mom = true;
-    qudaGaugeParam.return_mom = false;
+    qudaGaugeParam.return_result_mom = false;
+
+    // this means when we compute the momentum, we accumulate to the
+    // preexisting resident momentum instead of overwriting it
+    qudaGaugeParam.overwrite_mom = false;
  } else {
    qudaGaugeParam.use_resident_mom = false;
    qudaGaugeParam.make_resident_mom = false;
-    qudaGaugeParam.return_mom = true;
+    qudaGaugeParam.return_result_mom = true;
+
+    // this means we compute momentum into a fresh field, copy it back
+    // and sum to current momentum in MILC.  This saves an initial
+    // CPU->GPU download of the current momentum.
+    qudaGaugeParam.overwrite_mom = true;
  }

  int max_length = 6;

--- a/lib/quda_fortran.F90
+++ b/lib/quda_fortran.F90
@@ -59,12 +59,21 @@ module quda_fortran
     ! Whether the staggered phase has already been applied to the links
     integer(4) :: staggered_phase_applied

+     ! Imaginary chemical potential
+     real(8) :: i_mu
+
     integer(4) :: overlap ! width of domain overlap

+     ! When computing momentum, should we overwrite it or accumulate
+     ! to it (only presently supported in gauge-force)
+     integer(4) :: overwrite_mom
+
     integer(4) :: use_resident_gauge  ! Use the resident gauge field
-     integer(4) :: use_resident_mom    ! Use the resident mom field
-     integer(4) :: make_resident_gauge ! Make the gauge field resident
-     integer(4) :: make_resident_mom   ! Make the mom field resident
+     integer(4) :: use_resident_mom    ! Use the resident momentum field
+     integer(4) :: make_resident_gauge ! Make the result gauge field resident
+     integer(4) :: make_resident_mom   ! Make the result momentum field resident
+     integer(4) :: return_result_gauge ! Return the result gauge field
+     integer(4) :: return_result_mom   ! Return the result momentum field

  end type quda_gauge_param

@@ -114,6 +123,9 @@ module quda_fortran
     ! Actual L2 residual norm achieved in solver for each offset
     real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_offset

+     ! Iterated L2 residual achieved in multi shift solver for each offset
+     real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: iter_res_offset
+
     ! Actual heavy quark residual norm achieved in solver for each offset
     real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_hq_offset

@@ -206,6 +218,18 @@ module quda_fortran
     integer(4)::max_search_dim ! for magma library this parameter must be multiple 16?
     integer(4)::rhs_idx
     integer(4)::deflation_grid !total deflation space is nev*deflation_grid
+     integer(4)::use_reduced_vector_set ! eigCG: specifies whether to use reduced eigenvector set
+     real(8):: eigenval_tol ! eigCG: selection criterion for the reduced eigenvector set
+     integer(4)::use_cg_updates ! mixed precision eigCG:whether to use cg refinement corrections in the incremental stage
+     real(8)::cg_iterref_tol ! mixed precision eigCG:  tolerance for cg refinement corrections in the incremental stage
+     integer(4)::eigcg_max_restarts ! mixed precision eigCG tuning parameter:  minimum search vector space restarts
+     integer(4)::max_restart_num     ! initCG tuning parameter:  maximum restarts
+     real(8)::inc_tol     ! initCG tuning parameter:  decrease in absolute value of the residual within each restart cycle
+
+     ! Parameters for setting data residency of the solver
+     integer(8)::make_resident_solution ! Whether to make the solution vector(s) resident after the solve
+     integer(8)::use_resident_solution  ! Whether to use the resident solution vector(s)
+
  end type quda_invert_param

 end module quda_fortran