The version can be compiled

The c++ compiler is set to be hipcc. Thus the cmake should be hacked to include .cu in the suffix can be indentified to be the cppfiles. Otherwise one can rename all the .cu files into .cc to avoid the hack on cmake.

The version can be compiled
The c++ compiler is set to be hipcc. Thus the cmake should be hacked to include .cu in the suffix can be indentified to be the cppfiles. Otherwise one can rename all the .cu files into .cc to avoid the hack on cmake.
fe729b17 · YI-Bo Yang · 9505d29f · fe729b17 · fe729b17 · fe729b17
Commit fe729b17 authored Nov 04, 2019 by YI-Bo Yang
--- a/include/atomic.cuh
+++ b/include/atomic.cuh
@@ -20,7 +20,7 @@
   @param addr Address that stores the atomic variable to be updated
   @param val Value to be added to the atomic
 */
-static inline __device__ double atomicAdd(double* address, double val)
+/*static inline __device__ double atomicAdd(double* address, double val)
 {
  unsigned long long int* address_as_ull =
                            (unsigned long long int*)address;
@@ -36,7 +36,7 @@ static inline __device__ double atomicAdd(double* address, double val)
  } while (assumed != old);

  return __longlong_as_double(old);
-}
+}*/
 #endif

 /**

--- a/include/complex_quda.h
+++ b/include/complex_quda.h
@@ -1244,13 +1244,13 @@ lhs.real()*rhs.imag()+lhs.imag()*rhs.real());
  template <typename ValueType>
    __host__ __device__
    inline complex<ValueType> sqrt(const complex<ValueType>& z){
-    return polar(::sqrt(abs(z)),arg(z)/ValueType(2));
+    return polar(sqrt(abs(z)),arg(z)/ValueType(2));
  }

  template <typename ValueType>
    __host__ __device__
    inline complex<float> sqrt(const complex<float>& z){
-    return polar(::sqrtf(abs(z)),arg(z)/float(2));
+    return polar(sqrtf(abs(z)),arg(z)/float(2));
  }

  template <typename ValueType>

--- a/include/cub_helper.cuh
+++ b/include/cub_helper.cuh
@@ -7,7 +7,8 @@

 using namespace quda;

-#include <cub/block/block_reduce.cuh>
+#include <hipcub/hipcub.hpp>
+namespace cub=hipcub;

 #if __COMPUTE_CAPABILITY__ >= 300
 #include <generics/shfl.h>

--- a/include/float_vector.h
+++ b/include/float_vector.h
@@ -92,7 +92,7 @@ namespace quda {
    z.w = x.w + y.w;
    return z;
  }
-
+/*
  __host__ __device__ inline float4 operator+=(float4 &x, const float4 y) {
    x.x += y.x;
    x.y += y.y;
@@ -112,14 +112,14 @@ namespace quda {
    x.y += y.y;
    return x;
  }
-
+*/
  __host__ __device__ inline double3 operator+=(double3 &x, const double3 y) {
    x.x += y.x;
    x.y += y.y;
    x.z += y.z;
    return x;
  }
-
+/*
  __host__ __device__ inline double4 operator+=(double4 &x, const double4 y) {
    x.x += y.x;
    x.y += y.y;
@@ -147,7 +147,7 @@ namespace quda {
    x.y -= y.y;
    return x;
  }
-
+*/
  __host__ __device__ inline float2 operator*=(float2 &x, const float a) {
    x.x *= a;
    x.y *= a;
@@ -326,4 +326,38 @@ namespace quda {
  template<> struct RealType<complex<char> > { typedef char type; };
  template<> struct RealType<char4> { typedef char type; };

+  // The following added by Yujiang Bi
+     template<typename T> __host__ __device__ inline T rsqrt(T a){
+               return 1/sqrt(a);
+     }
+
+  __device__ __host__
+  inline
+  void sincos(float x, float* sptr, float* cptr)
+  {
+      float tmp;
+      *sptr =
+          __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+      *cptr = tmp;
+  }
+  __device__ __host__
+  inline
+  void sincos(float x, double * sptr, float* cptr)
+  {
+      float tmp;
+      *sptr =
+          __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+      *cptr = tmp;
+  }
+  __device__ __host__
+  inline
+  void sincos(float x, double * sptr, double * cptr)
+  {
+      float tmp;
+      *sptr =
+          __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+      *cptr = tmp;
+  }
+
+
 }
--- a/include/hiprand_kernel.h
+++ b/include/hiprand_kernel.h
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef HIPRAND_KERNEL_H_
+#define HIPRAND_KERNEL_H_
+
+#ifndef QUALIFIERS
+#define QUALIFIERS __forceinline__ __device__ __host__
+#endif // QUALIFIERS
+
+#include <hip/hip_runtime.h>
+#include <hiprand.h>
+
+/** \addtogroup hipranddevice
+ *
+ *  @{
+ */
+
+ /**
+ * \def HIPRAND_PHILOX4x32_DEFAULT_SEED
+ * \brief Default seed for PHILOX4x32 PRNG.
+ */
+#define HIPRAND_PHILOX4x32_DEFAULT_SEED 0ULL
+ /**
+ * \def HIPRAND_XORWOW_DEFAULT_SEED
+ * \brief Default seed for XORWOW PRNG.
+ */
+#define HIPRAND_XORWOW_DEFAULT_SEED 0ULL
+ /**
+ * \def HIPRAND_MRG32K3A_DEFAULT_SEED
+ * \brief Default seed for MRG32K3A PRNG.
+ */
+#define HIPRAND_MRG32K3A_DEFAULT_SEED 12345ULL
+/** @} */ // end of group hipranddevice
+
+#ifdef __HIP_PLATFORM_HCC__
+#include "hiprand_kernel_hcc.h"
+#else
+#include "hiprand_kernel_nvcc.h"
+#endif
+
+#endif // HIPRAND_KERNEL_H_
--- a/include/inline_ptx.h
+++ b/include/inline_ptx.h
@@ -20,84 +20,110 @@ namespace quda {

  __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
  {
+    a.x = ((double *)addr)[0];a.y = ((double *)addr)[1];
+/*
    double x, y;
    asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
    a.x = x; a.y = y;
+*/
  }

  __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
  {
+    a.x = ((float *)addr)[0];a.y = ((float *)addr)[1];a.z = ((float *)addr)[2];a.w = ((float *)addr)[3];
+/*
    float x, y, z, w;
    asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
    a.x = x; a.y = y; a.z = z; a.w = w;
+*/
  }

  __device__ inline void load_cached_short4(short4 &a, const short4 *addr)
  {
+    a.x = ((short *)addr)[0];a.y = ((short *)addr)[1];a.z = ((short *)addr)[2];a.w = ((short *)addr)[3];
+/*
    short x, y, z, w;
    asm("ld.ca.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
    a.x = x;
    a.y = y;
    a.z = z;
    a.w = w;
+*/
  }

  __device__ inline void load_cached_short2(short2 &a, const short2 *addr)
  {
+    a.x = ((short*)addr)[0];a.y = ((short*)addr)[1];
+/*
    short x, y;
    asm("ld.ca.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
    a.x = x;
    a.y = y;
+*/
  }

  __device__ inline void load_global_short4(short4 &a, const short4 *addr)
  {
+    a.x = ((short *)addr)[0];a.y = ((short *)addr)[1];a.z = ((short *)addr)[2];a.w = ((short *)addr)[3];
+/*
    short x, y, z, w;
    asm("ld.cg.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
    a.x = x;
    a.y = y;
    a.z = z;
    a.w = w;
+*/
  }

  __device__ inline void load_global_short2(short2 &a, const short2 *addr)
  {
+    a.x = ((short*)addr)[0];a.y = ((short*)addr)[1];
+/*
    short x, y;
    asm("ld.cg.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
    a.x = x;
    a.y = y;
+*/
  }

  __device__ inline void load_global_float4(float4 &a, const float4* addr)
  {
+    a.x = ((float *)addr)[0];a.y = ((float *)addr)[1];a.z = ((float *)addr)[2];a.w = ((float *)addr)[3];
+/*
    float x, y, z, w;
    asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
    a.x = x; a.y = y; a.z = z; a.w = w;
+*/
  }

  __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
  {
-    asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
+    ((float *)addr)[0] = x;((float *)addr)[1] = y;((float *)addr)[2] = z;((float *)addr)[3] = w;
+//    asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
  }

  __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
  {
-    asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
+    ((short *)addr)[0] = x;((short *)addr)[1] = y;((short *)addr)[2] = z;((short *)addr)[3] = w;
+//    asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
  }

  __device__ inline void store_streaming_double2(double2* addr, double x, double y)
  {
-    asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
+    ((double *)addr)[0] = x;((double *)addr)[1] = y;
+//    asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
  }

  __device__ inline void store_streaming_float2(float2* addr, float x, float y)
  {
-    asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
+    ((float *)addr)[0] = x;((float *)addr)[1] = y;
+//    asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
  }

  __device__ inline void store_streaming_short2(short2* addr, short x, short y)
  {
-    asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
+    ((short *)addr)[0] = x;((short *)addr)[1] = y;
+//    asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
  }

 } // namespace quda
--- a/include/kernels/coarse_op_preconditioned.cuh
+++ b/include/kernels/coarse_op_preconditioned.cuh
@@ -131,7 +131,7 @@ namespace quda {
    int d = j_d / n;

    Float max = computeYhat<Float, n, compute_max_only>(arg, d, x_cb, parity, i, j);
-    if (compute_max_only) atomicMax(arg.max_d, max);
+    if (compute_max_only) assert(1); // atomicMax(arg.max_d, max);
  }

 } // namespace quda
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -96,7 +96,6 @@ namespace quda {
     @param parity The site parity
     @param x_cb The checkerboarded site index
   */
-  HIP_DYNAMIC_SHARED( float, s)
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_stride, int thread_dir, int thread_dim, bool dagger, DslashType type, typename Arg>
  __device__ __host__ inline void applyDslash(complex<Float> out[], Arg &arg, int x_cb, int src_idx, int parity, int s_row, int color_block, int color_offset) {
    const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;
@@ -106,6 +105,7 @@ namespace quda {
    coord[4] = src_idx;

 #ifdef __HIP_DEVICE_COMPILE__
+  HIP_DYNAMIC_SHARED( float, s)
    complex<Float> *shared_sum = (complex<Float>*)s;
    if (!thread_dir) {
 #endif

--- a/include/kernels/multi_blas_core.cuh
+++ b/include/kernels/multi_blas_core.cuh
@@ -8,7 +8,7 @@
 #include <float_vector.h>

 #if (__COMPUTE_CAPABILITY__ >= 300 || __HIP_DEVICE_COMPILE__ >= 300)
-#define WARP_SPLIT
+//#define WARP_SPLIT
 #include <generics/shfl.h>
 #endif


--- a/include/texture.h
+++ b/include/texture.h
@@ -439,7 +439,7 @@ public:
      convert<InterType, RegType>(y, x, M);

      if (isFixed<StoreType>::value) {
-        float C = SN::store_norm<M>(y, i, parity);
+        float C =SN::template store_norm<M>(y, i, parity);
 #pragma unroll
        for (int j = 0; j < M; j++) copyFloatN(spinor[ST::cb_offset * parity + i + j * ST::stride], C * y[j]);
      } else {

--- a/lib/blas_cublas.cu
+++ b/lib/blas_cublas.cu
@@ -5,6 +5,8 @@
 #endif
 #include <malloc_quda.h>

+#include <Eigen/LU>
+
 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) \
    ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) - 1. ) + (n_)) + (2. / 3.) * (m_)) \
    : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) - 1. ) + (m_)) + (2. / 3.) * (n_)) )
@@ -55,11 +57,11 @@ namespace quda {
    long long BatchInvertMatrix(void *Ainv, void* A, const int n, const int batch, QudaPrecision prec, QudaFieldLocation location)
    {
      long long flops = 0;
-#ifdef CUBLAS_LIB
      timeval start, stop;
+      size_t size = 2*n*n*prec*batch;
+/*#ifdef CUBLAS_LIB
      gettimeofday(&start, NULL);

-      size_t size = 2*n*n*prec*batch;
      void *A_d = location == QUDA_CUDA_FIELD_LOCATION ? A : pool_device_malloc(size);
      void *Ainv_d = location == QUDA_CUDA_FIELD_LOCATION ? Ainv : pool_device_malloc(size);
      if (location == QUDA_CPU_FIELD_LOCATION) qudaMemcpy(A_d, A, size, hipMemcpyHostToDevice);
@@ -131,7 +133,47 @@ namespace quda {

      if (getVerbosity() >= QUDA_VERBOSE)
 	printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * flops / time);
-#endif // CUBLAS_LIB
+#endif */ // CUBLAS_LIB
+
+      gettimeofday(&start, NULL);
+      void *A_h = location == QUDA_CUDA_FIELD_LOCATION ? pool_pinned_malloc(size): A;
+      void *Ainv_h = pool_pinned_malloc(size);
+      if (location == QUDA_CUDA_FIELD_LOCATION) qudaMemcpy(A_h, A, size,hipMemcpyDeviceToHost);
+
+      if (prec == QUDA_SINGLE_PRECISION) {
+
+        std::complex<float> *A_eig=(std::complex<float> *)A_h;
+        std::complex<float> *Ainv_eig=(std::complex<float> *)Ainv_h;
+
+#pragma omp parallel for
+        for(int i=0;i<batch;i++)
+        {
+                Eigen::MatrixXcd res=Eigen::MatrixXcd::Zero(n,n),inv;
+                for(int j=0;j<n;j++)
+                for(int k=0;k<n;k++)
+                        res(j,k)=A_eig[i*n*n+j*n+k];
+                inv=res.inverse();
+                for(int j=0;j<n;j++)
+                for(int k=0;k<n;k++)
+                        Ainv_eig[i*n*n+j*n+k]=inv(j,k);
+        }
+      }
+      else {
+        errorQuda("%s not implemented for precision=%d", __func__, prec);
+      }
+
+     gettimeofday(&stop, NULL);
+     long ds = stop.tv_sec - start.tv_sec;
+     long dus = stop.tv_usec - start.tv_usec;
+     double time = ds + 0.000001*dus;
+
+        printfQuda("CPU: Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * batch*FLOPS_CGETRI(n) / time);
+
+      if (location == QUDA_CUDA_FIELD_LOCATION) {
+        qudaMemcpy(Ainv, Ainv_h, size, hipMemcpyHostToDevice);
+        pool_pinned_free(Ainv_h);
+        pool_pinned_free(A_h);
+      }

      return flops;
    }

--- a/lib/comm_common.cpp
+++ b/lib/comm_common.cpp
@@ -220,7 +220,8 @@ void comm_peer2peer_init(const char* hostname_recv_buf)
    const int gpuid = comm_gpuid();
    hipDeviceProp_t prop;
    hipGetDeviceProperties(&prop, gpuid);
-    if(!prop.unifiedAddressing) return;
+    errorQuda("enable_peer_to_peer is not supported in this HIP version\n");
+//    if(!prop.unifiedAddressing) return;

    comm_set_neighbor_ranks();

@@ -446,11 +447,10 @@ inline bool isHost(const void *buffer)
  hipMemoryType memType;
  void *attrdata[] = {(void *)&memType};
  hipPointerAttribute_t attributes;//[2] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
-  hipError_t err = cuPointerGetAttributes(1, attributes, attrdata, (hipDeviceptr_t)buffer);
+  hipError_t err = hipPointerGetAttributes(&attributes, (hipDeviceptr_t)buffer);memType=attributes.memoryType;
  if (err != hipSuccess) {
-    const char *str;
-    str=hipGetErrorName(err);
-    errorQuda("cuPointerGetAttributes returned error %s", str);
+    memType=hipMemoryTypeHost;
+    printfQuda("hipPointerGetAttributes returned unknown address, surppose it to be a host address\n");
  }

  switch (memType) {

--- a/lib/extract_gauge_ghost_helper.cuh
+++ b/lib/extract_gauge_ghost_helper.cuh
@@ -137,7 +137,7 @@ namespace quda {
    if (oddness == parity) {
 #ifdef FINE_GRAINED_ACCESS
      int i = blockIdx.y * blockDim.y + threadIdx.y;
-      if (i >= Ncolor(length)) return;
+      if (i >= gauge::Ncolor(length)) return;
      for (int j=0; j<gauge::Ncolor(length); j++) {
 	if (extract) {
 	  arg.order.Ghost(dim, (parity+arg.localParity[dim])&1, X>>1, i, j)

--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -77,7 +77,6 @@
 #include <momentum.h>


-#include <hip/hip_profile.h>

 using namespace quda;


--- a/lib/malloc.cpp
+++ b/lib/malloc.cpp
@@ -339,7 +339,7 @@ namespace quda

    a.size = a.base_size = size;

-    hipError_t err = cudaMallocManaged(&ptr, size);
+    hipError_t err = hipMallocManaged(&ptr, size);
    if (err != hipSuccess) {
      errorQuda("Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func);
    }

--- a/lib/quda_cuda_api.cpp
+++ b/lib/quda_cuda_api.cpp
@@ -80,7 +80,7 @@ namespace quda {
          PROFILE(hipMemcpyDtoHAsync(dst, (hipDeviceptr_t)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
          break;
        case hipMemcpyHostToDevice:
-          PROFILE(hipMemcpyHtoDAsync((hipDeviceptr_t)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
+          PROFILE(hipMemcpyHtoDAsync((hipDeviceptr_t)dst, const_cast<void *>(src), count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
          break;
        case hipMemcpyDeviceToDevice:
          PROFILE(hipMemcpyDtoDAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
@@ -96,10 +96,10 @@ namespace quda {
 #ifdef USE_DRIVER_API
        switch(kind) {
        case hipMemcpyDeviceToHost:   hipMemcpyDtoH(dst, (hipDeviceptr_t)src, count);              break;
-        case hipMemcpyHostToDevice:   hipMemcpyHtoD((hipDeviceptr_t)dst, src, count);              break;
+        case hipMemcpyHostToDevice:   hipMemcpyHtoD((hipDeviceptr_t)dst, const_cast<void *>(src), count);              break;
        case hipMemcpyHostToHost:     memcpy(dst, src, count);                                 break;
        case hipMemcpyDeviceToDevice: hipMemcpyDtoD((hipDeviceptr_t)dst, (hipDeviceptr_t)src, count); break;
-        case hipMemcpyDefault:        cuMemcpy((hipDeviceptr_t)dst, (hipDeviceptr_t)src, count);     break;
+        case hipMemcpyDefault:        hipMemcpy((hipDeviceptr_t)dst, (hipDeviceptr_t)src, count, kind);     break;
        default:
 	errorQuda("Unsupported hipMemcpy %d", kind);
        }
@@ -152,7 +152,7 @@ namespace quda {
        PROFILE(hipMemcpyDtoHAsync(dst, (hipDeviceptr_t)src, count, stream), QUDA_PROFILE_MEMCPY_D2H_ASYNC);
        break;
      case hipMemcpyHostToDevice:
-        PROFILE(hipMemcpyHtoDAsync((hipDeviceptr_t)dst, src, count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
+        PROFILE(hipMemcpyHtoDAsync((hipDeviceptr_t)dst, const_cast<void *>(src), count, stream), QUDA_PROFILE_MEMCPY_H2D_ASYNC);
        break;
      case hipMemcpyDeviceToDevice:
        PROFILE(hipMemcpyDtoDAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, count, stream), QUDA_PROFILE_MEMCPY_D2D_ASYNC);
@@ -192,7 +192,8 @@ namespace quda {
    default:
      errorQuda("Unsupported cuMemcpyType2DAsync %d", kind);
    }
-    PROFILE(hipMemcpy2DAsync(&param, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
+    printfQuda("driver_api is not supported in hipMemcpy2DAsync\n");
+    PROFILE(hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);    
 #else
    PROFILE(hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream), QUDA_PROFILE_MEMCPY2D_D2H_ASYNC);
 #endif
@@ -201,7 +202,8 @@ namespace quda {
  hipError_t qudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, hipStream_t stream)
  {
    // no driver API variant here since we have C++ functions
-    PROFILE(hipError_t error = hipLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream), QUDA_PROFILE_LAUNCH_KERNEL);
+    printfQuda("this kernel can not be used in HIP\n");hipError_t error = hipErrorUnknown;
+//    PROFILE(hipError_t error = hipLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream), QUDA_PROFILE_LAUNCH_KERNEL);
    if (error != hipSuccess && !activeTuning()) errorQuda("(CUDA) %s", hipGetErrorString(error));
    return error;
  }

--- a/lib/reduce_quda.cu
+++ b/lib/reduce_quda.cu
@@ -95,7 +95,7 @@ namespace quda {
 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
 	if(deviceProp.canMapHostMemory) {
 	  h_reduce = (QudaSumFloat *) mapped_malloc(bytes);
-	  hipHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
+	  hipHostGetDevicePointer((void **)&hd_reduce, h_reduce, 0); // set the matching device pointer
 	} else
 #endif
 	  {

--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -103,7 +103,7 @@ namespace quda {
    }
  }

-  static const std::string quda_hash = QUDA_HASH; // defined in lib/Makefile
+  static const std::string quda_hash = "x64_64.quda_dev_rocm.hip2.8.19354"; // defined in lib/Makefile
  static std::string resource_path;
  static map tunecache;
  static map::iterator it;

--- a/lib/version.cpp
+++ b/lib/version.cpp
 #ifdef GITVERSION
-const char* gitversion = GITVERSION ;
+const char* gitversion = "rocm" ;
 #else
 const char* gitversion;
 #endif