Advanced Computing Platform for Theoretical Physics

Commit 7f8a13b9 authored by rbabich

cleaned up quda Makefile. 'make clean' is no longer necessary after changing header files.


git-svn-id: http://lattice.bu.edu/qcdalg/cuda/quda@461 be54200a-260c-0410-bdd7-ce6af2a381ab
parent 2f900f63
###### Local configuration:
CUDA_INSTALL_PATH = /usr/local/cuda
CPU_ARCH = x86_64 # x86 or x86_64
GPU_ARCH = sm_13 # sm_10, sm_11, sm_12, or sm_13
GPU_EMU = false # set to 'true' for device emulation
PYTHON = python2.6 # python 2.5 or later required for 'make generated'
######
INC = -I. -I$(CUDA_INSTALL_PATH)/include
ifeq ($(strip $(CPU_ARCH)), x86_64)
LIB = -L$(CUDA_INSTALL_PATH)/lib64 -lcudart # for release 2.3 and later
# LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart # for release 2.2 and earlier
COPT =
else
LIB = -L$(CUDA_INSTALL_PATH)/lib -lcudart
COPT = -malign-double
endif
ifeq ($(strip $(GPU_EMU)), true)
COPT += -D__DEVICE_EMULATION__
NVCCOPT = -deviceemu
endif
CC = gcc
CFLAGS = -Wall -O3 -std=c99 $(COPT) $(INC)
CXX = g++
CXXFLAGS = -Wall -O3 $(COPT) $(INC)
NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
NVCCFLAGS = -O3 $(NVCCOPT) -arch=$(GPU_ARCH) $(INC)
LDFLAGS = -fPIC $(LIB)
all: dslash_test invert_test su3_test pack_test
ILIB = libquda.a
ILIB_OBJS = blas_quda.o blas_reference.o dslash_quda.o dslash_reference.o \
gauge_quda.o inv_bicgstab_quda.o inv_cg_quda.o invert_quda.o \
spinor_quda.o util_quda.o
ILIB_HDRS = blas_quda.h blas_reference.h dslash_def.h dslash_quda.h \
dslash_reference.h enum_quda.h gauge_quda.h invert_quda.h \
io_spinor.h quda.h read_clover.h read_gauge.h reduce_complex_core.h \
reduce_core.h reduce_triple_core.h spinor_quda.h util_quda.h
ILIB_CORE = dslash_core.h dslash_dagger_core.h
$(ILIB): $(ILIB_OBJS)
	ar cru $@ $(ILIB_OBJS)
dslash_test: dslash_test.o $(ILIB)
	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@

invert_test: invert_test.o $(ILIB)
	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@

su3_test: su3_test.o $(ILIB)
	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@

pack_test: pack_test.o $(ILIB)
	$(CXX) $(LDFLAGS) $< $(ILIB) -o $@
generated:
	$(PYTHON) dslash_cuda_gen.py

clean:
	-rm -f *.o dslash_test invert_test su3_test pack_test $(ILIB)
%.o: %.c $(ILIB_HDRS)
	$(CC) $(CFLAGS) $< -c -o $@
%.o: %.cpp $(ILIB_HDRS)
	$(CXX) $(CXXFLAGS) $< -c -o $@
%.o: %.cu $(ILIB_HDRS) $(ILIB_CORE)
	$(NVCC) $(NVCCFLAGS) $< -c -o $@
.PHONY: all generated clean
QUDA v0.x Release Notes
-----------------------
Overview:
QUDA is a library for performing calculations in lattice QCD on
graphics processing units (GPUs) using NVIDIA's "C for CUDA" API.
This release includes optimized kernels for applying the Wilson Dirac
operator and clover-improved Wilson Dirac operator, kernels for
performing various BLAS-like operations, and full inverters built on
these kernels. Mixed-precision implementations of both CG and
BiCGstab are provided, with support for double, single, and half
(16-bit fixed-point) precision.
NOTE: In this pre-release, only the BiCGstab inverter supports clover.
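The mixed-precision idea described above can be illustrated with a toy defect-correction loop (plain Python, not QUDA code; the quantization helper is a crude stand-in for half precision, and the 2x2 system stands in for the Dirac operator): the inner solve runs on low-precision data, while the outer loop accumulates the solution and recomputes the residual in full double precision.

```python
import math

def quantize(x, bits=10):
    """Stand-in for low precision: keep roughly `bits` bits of mantissa."""
    if x == 0.0:
        return 0.0
    e = math.floor(math.log2(abs(x)))
    scale = 2.0 ** (e - bits)
    return round(x / scale) * scale

# A tiny SPD system A x = b (illustrative stand-in for the Dirac operator).
A = [[4.0, 1.0], [1.0, 3.0]]
b = [1.0, 2.0]

def matvec(M, v):
    return [sum(m * vi for m, vi in zip(row, v)) for row in M]

def inner_solve_low(M, r):
    """'Inner' solve carried out on quantized data (exact 2x2 formula)."""
    a, c = M[0]
    d, e = M[1]
    det = a * e - c * d
    y = [(e * r[0] - c * r[1]) / det, (a * r[1] - d * r[0]) / det]
    return [quantize(v) for v in y]

x = [0.0, 0.0]
for _ in range(20):
    # Outer loop: residual computed in full (double) precision.
    r = [bi - ai for bi, ai in zip(b, matvec(A, x))]
    dx = inner_solve_low([[quantize(v) for v in row] for row in A],
                         [quantize(v) for v in r])
    x = [xi + di for xi, di in zip(x, dx)]

# x converges to the double-precision solution (1/11, 7/11)
# even though every inner solve used only ~10 bits of precision.
```

Each outer iteration shrinks the error by roughly the relative accuracy of the inner solve, which is why a half-precision inner solver can still deliver a full double-precision answer after a few corrections.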
Software compatibility:
The library has been tested under linux (CentOS 5.3 and Ubuntu 8.04)
using release 2.3 of the CUDA toolkit. There are known issues with
releases 2.1 and 2.2, but 2.0 should work if one is forced to use an
older version (for compatibility with an old driver, for example).
Under Mac OS X, the library fails to compile due to bugs in CUDA 2.3.
It might work with CUDA 2.2 or 2.0, but this hasn't been tested.
Hardware compatibility:
For a list of supported devices, see
http://www.nvidia.com/object/cuda_learn_products.html
Before building the library, you should determine the "compute
capability" of your card, either from NVIDIA's documentation or by
running the deviceQuery example in the CUDA SDK, and set GPU_ARCH in
the Makefile appropriately. Setting 'GPU_ARCH = sm_13' will enable
double precision support.
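As a rough guide, the GPU_ARCH setting tracks the compute capability as sketched below (the card-to-capability examples are our reading of NVIDIA's tables; deviceQuery is authoritative for your hardware):

```
# GPU_ARCH = sm_10   # compute capability 1.0, e.g. GeForce 8800 GTX
# GPU_ARCH = sm_11   # compute capability 1.1, e.g. GeForce 9800 GT
# GPU_ARCH = sm_13   # compute capability 1.3, e.g. GTX 280, Tesla C1060
GPU_ARCH = sm_13     # 1.3 is required for double precision support
```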
Installation:
In the source directory, copy the template 'Makefile.tmpl' to
'Makefile', and edit the first few lines to specify the CUDA install
path, the platform (x86 or x86_64), and the GPU architecture (see
"Compatibility" above). Then type 'make' to build the library.
Using the library:
Include the header file "invert_quda.h" in your application, link
against libquda.a, and study invert_test.c for an example of the
interface.
Known issues:
* One of the stages of the build process requires over 5 GB of memory.
If too little memory is available, the compilation will either take
a very long time (given enough swap space) or fail completely.
* For compatibility with CUDA, on 32-bit platforms the library is compiled
with the GCC option -malign-double. This differs from the GCC default
and may affect the alignment of various structures, notably those of
type QudaGaugeParam and QudaInvertParam, defined in invert_quda.h.
Therefore, any code to be linked against QUDA should also be compiled
with this option.
Contact information:
For help or to report a bug, please contact Mike Clark
(mikec@seas.harvard.edu) or Ron Babich (rbabich@bu.edu).
If you find this code useful in your work, a citation to the following
write-up would be appreciated:
K. Barros et al., "Blasting through lattice calculations using CUDA,"
PoS LATTICE2008, 045 (2008) [arXiv:0810.5365 [hep-lat]].
Please also let us know so that we can send you updates and bug-fixes.
import sys
### complex numbers ########################################################################
sharedFloats = 0
dagger = False
print sys.argv[0] + ": generating dslash_core.h";
f = open('dslash_core.h', 'w')
f.write(generate())
f.close()
dagger = True
print sys.argv[0] + ": generating dslash_dagger_core.h";
f = open('dslash_dagger_core.h', 'w')
f.write(generate())
f.close()