quda-0.2: fixed loadParitySpinor() and loadFullSpinor() to be compatible

with padding git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@689 be54200a-260c-0410-bdd7-ce6af2a381ab

quda-0.2: fixed loadParitySpinor() and loadFullSpinor() to be compatible
with padding git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@689 be54200a-260c-0410-bdd7-ce6af2a381ab
be8375c9 · rbabich · 4cace40e · be8375c9 · be8375c9 · be8375c9
Commit be8375c9 authored Feb 05, 2010 by rbabich
--- a/NEWS
+++ b/NEWS
+Version 0.2.1
+
+- Fixed bug that would sometimes cause the inverter to fail when
+  padding, mixed precision, and the "PRESERVE_SOURCE_NO" option were
+  all used at the same time.
+

 Version 0.2 - 16 December 2009


--- a/README
+++ b/README
@@ -82,12 +82,12 @@ Known issues:
  requires over 1 GB of disk space in /tmp for the creation of
  temporary files.

-* For compatibility with CUDA, on 32-bit platforms the library is compiled
-  with the GCC option -malign-double.  This differs from the GCC default
-  and may affect the alignment of various structures, notably those of
-  type QudaGaugeParam and QudaInvertParam, defined in invert_quda.h.
-  Therefore, any code to be linked against QUDA should also be compiled
-  with this option.
+* For compatibility with CUDA, on 32-bit platforms the library is
+  compiled with the GCC option -malign-double.  This differs from the
+  GCC default and may affect the alignment of various structures,
+  notably those of type QudaGaugeParam and QudaInvertParam, defined in
+  quda.h.  Therefore, any code to be linked against QUDA should also
+  be compiled with this option.


 Contact information:

--- a/include/spinor_quda.h
+++ b/include/spinor_quda.h
@@ -12,7 +12,7 @@ extern "C" {
  
  void freeParitySpinor(ParitySpinor spinor);
  void freeSpinorField(FullSpinor spinor);
-  void freeSpinorBuffer();
+  void freeSpinorBuffer(void);

  void loadParitySpinor(ParitySpinor, void *spinor, Precision cpu_prec,
 			DiracFieldOrder dirac_order);

--- a/lib/spinor_quda.cpp
+++ b/lib/spinor_quda.cpp
 #include <stdlib.h>
+#include <string.h>
 #include <stdio.h>

 #include <quda_internal.h>
@@ -6,8 +7,43 @@
 #include <blas_quda.h>

 // Pinned memory for cpu-gpu memory copying
-void *packedSpinor1 = 0;
-void *packedSpinor2 = 0;
+void *packedSpinor1 = NULL;
+void *packedSpinor2 = NULL;
+
+static void allocateBuffer(void **packedSpinor, size_t bytes)
+{
+  static size_t last_bytes = 0;
+
+  if (!*packedSpinor) {
+#ifndef __DEVICE_EMULATION__
+    cudaMallocHost(packedSpinor, bytes);
+#else
+    *packedSpinor = malloc(bytes);
+#endif
+    memset(*packedSpinor, 0, bytes);
+    last_bytes = bytes;
+  } else if (bytes != last_bytes) {
+    errorQuda("unexpected change in lattice volume or cpu_prec");
+  }
+}
+
+static void freeBuffer(void **packedSpinor)
+{
+  if (*packedSpinor) {
+#ifndef __DEVICE_EMULATION__
+    cudaFreeHost(*packedSpinor);
+#else
+    free(*packedSpinor);
+#endif
+    *packedSpinor = NULL;
+  }
+}
+
+void freeSpinorBuffer(void)
+{
+  freeBuffer(&packedSpinor1);
+  freeBuffer(&packedSpinor2);
+}

 static int L[4];

@@ -70,15 +106,6 @@ void freeSpinorField(FullSpinor spinor) {
  freeParitySpinor(spinor.odd);
 }

-void freeSpinorBuffer() {
-#ifndef __DEVICE_EMULATION__
-  cudaFreeHost(packedSpinor1);
-#else
-  free(packedSpinor1);
-#endif
-  packedSpinor1 = NULL;
-}
-
 template <typename Float>
 inline void packSpinorVector(float4* a, Float *b, int V) {
  Float K = 1.0 / 2.0;
@@ -373,11 +400,7 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,

  if (ret.precision != QUDA_HALF_PRECISION) {

-#ifndef __DEVICE_EMULATION__
-    if (!packedSpinor1) cudaMallocHost(&packedSpinor1, ret.bytes);
-#else
-    if (!packedSpinor1) packedSpinor1 = malloc(ret.bytes);
-#endif
+    allocateBuffer(&packedSpinor1, ret.bytes);
    
    if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
      if (ret.precision == QUDA_DOUBLE_PRECISION) {
@@ -407,14 +430,9 @@ void loadParitySpinor(ParitySpinor ret, void *spinor, Precision cpu_prec,
 void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {

  if (ret.even.precision != QUDA_HALF_PRECISION) {
-    
-#ifndef __DEVICE_EMULATION__
-    if (!packedSpinor1) cudaMallocHost(&packedSpinor1, ret.even.bytes);
-    if (!packedSpinor2) cudaMallocHost(&packedSpinor2, ret.even.bytes);
-#else
-    if (!packedSpinor1) packedSpinor1 = malloc(ret.even.bytes);
-    if (!packedSpinor2) packedSpinor2 = malloc(ret.even.bytes);
-#endif
+
+    allocateBuffer(&packedSpinor1, ret.even.bytes);
+    allocateBuffer(&packedSpinor2, ret.even.bytes);
    
    if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
      packFullSpinor((double2*)packedSpinor1, (double2*)packedSpinor2, (double*)spinor, ret.even.volume, ret.even.pad);
@@ -428,12 +446,8 @@ void loadFullSpinor(FullSpinor ret, void *spinor, Precision cpu_prec) {
    cudaMemcpy(ret.even.spinor, packedSpinor1, ret.even.bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(ret.odd.spinor, packedSpinor2, ret.even.bytes, cudaMemcpyHostToDevice);

-#ifndef __DEVICE_EMULATION__
-    cudaFreeHost(packedSpinor2);
-#else
-    free(packedSpinor2);
-#endif
-    packedSpinor2 = 0;
+    freeBuffer(&packedSpinor2);
+
  } else {
    FullSpinor tmp = allocateSpinorField(ret.even.X, QUDA_SINGLE_PRECISION, ret.even.pad);
    loadFullSpinor(tmp, spinor, cpu_prec);
@@ -466,7 +480,8 @@ void loadSpinorField(FullSpinor ret, void *spinor, Precision cpu_prec, DiracFiel
 void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, DiracFieldOrder dirac_order) {

  if (spinor.precision != QUDA_HALF_PRECISION) {
-    if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor.bytes);
+
+    allocateBuffer(&packedSpinor1, spinor.bytes);
    cudaMemcpy(packedSpinor1, spinor.spinor, spinor.bytes, cudaMemcpyDeviceToHost);

    if (dirac_order == QUDA_DIRAC_ORDER || QUDA_CPS_WILSON_DIRAC_ORDER) {
@@ -496,9 +511,10 @@ void retrieveParitySpinor(void *res, ParitySpinor spinor, Precision cpu_prec, Di
 void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {

  if (spinor.even.precision != QUDA_HALF_PRECISION) {
-    if (!packedSpinor1) cudaMallocHost((void**)&packedSpinor1, spinor.even.bytes);
-    if (!packedSpinor2) cudaMallocHost((void**)&packedSpinor2, spinor.even.bytes);
-    
+
+    allocateBuffer(&packedSpinor1, spinor.even.bytes);
+    allocateBuffer(&packedSpinor2, spinor.even.bytes);
+
    cudaMemcpy(packedSpinor1, spinor.even.spinor, spinor.even.bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(packedSpinor2, spinor.odd.spinor, spinor.odd.bytes, cudaMemcpyDeviceToHost);

@@ -509,13 +525,8 @@ void retrieveFullSpinor(void *res, FullSpinor spinor, Precision cpu_prec) {
 	unpackFullSpinor((double*)res, (float4*)packedSpinor1, (float4*)packedSpinor2, spinor.even.volume, spinor.even.pad);
      else unpackFullSpinor((float*)res, (float4*)packedSpinor1, (float4*)packedSpinor2, spinor.even.volume, spinor.even.pad);
    }
-    
-#ifndef __DEVICE_EMULATION__
-    cudaFreeHost(packedSpinor2);
-#else
-    free(packedSpinor2);
-#endif
-    packedSpinor2 = 0;
+    freeBuffer(&packedSpinor2);
+
  } else {
    FullSpinor tmp = allocateSpinorField(spinor.even.X, QUDA_SINGLE_PRECISION, spinor.even.pad);
    copyCuda(tmp.even, spinor.even);