slam-code
diff --git a/‎GPU-并行编程/CMakeLists.txt‎
Lines changed: 25 additions & 7 deletions b/‎GPU-并行编程/CMakeLists.txt‎
Lines changed: 25 additions & 7 deletions
diff --git a/‎GPU-并行编程/help_function/CMakeLists.txt‎ b/‎GPU-并行编程/help_function/CMakeLists.txt‎
diff --git a/‎GPU-并行编程/help_function/cuda_utils.h‎
Lines changed: 220 additions & 0 deletions b/‎GPU-并行编程/help_function/cuda_utils.h‎
Lines changed: 220 additions & 0 deletions
@@ -1,18 +1,36 @@
-
 # CMakeLists.txt for G4CU project
 project(test_cuda_project)
 # required cmake version
 cmake_minimum_required(VERSION 2.8)
+
+#SET (CMAKE_C_COMPILER  "/usr/bin/clang")
+#SET(CMAKE_CXX_COMPILER "/usr/bin/clang++")
+#
+enable_language(CXX)
+#
+#enable_language(C)
+
+
 # packages
 find_package(CUDA)
 # nvcc flags
-set(CUDA_NVCC_FLAGS -gencode arch=compute_20,code=sm_20;-G;-g)
+#set(CUDA_NVCC_FLAGS -gencode arch=compute_20,code=sm_20;-G;-g)
 #set(CUDA_NVCC_FLAGS -gencode arch=compute_52,code=sm_52;-G;-g)
+set(CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30;-G;-g)
+
+set(CUDA_NVCC_FLAGS -w)
+
+
+CUDA_ADD_EXECUTABLE(test_vec vectorAdd.cu )
+CUDA_ADD_EXECUTABLE(what_is_my_id what_is_my_id.cu)
+CUDA_ADD_EXECUTABLE(what_is_my_id_2D_A what_is_my_id_2D_A.cu)
+
+
 
-file(GLOB_RECURSE CURRENT_HEADERS  *.h *.hpp *.cuh)
-file(GLOB CURRENT_SOURCES  *.cpp *.cu)
-source_group("Include" FILES ${CURRENT_HEADERS}) 
-source_group("Source" FILES ${CURRENT_SOURCES}) 
 
-CUDA_ADD_EXECUTABLE(test_cuda_project ${CURRENT_HEADERS} ${CURRENT_SOURCES})
+#file(GLOB_RECURSE CURRENT_HEADERS  *.h *.hpp *.cuh)
+#file(GLOB CURRENT_SOURCES  *.cpp *.cu)
+#source_group("Include" FILES ${CURRENT_HEADERS}) 
+#source_group("Source" FILES ${CURRENT_SOURCES}) 
 
+#CUDA_ADD_EXECUTABLE(test_cuda_project ${CURRENT_HEADERS} ${CURRENT_SOURCES})
@@ -0,0 +1,220 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file cuda_utils.h
+ * \brief CUDA debugging utilities.
+ */
+#ifndef MXNET_COMMON_CUDA_UTILS_H_
+#define MXNET_COMMON_CUDA_UTILS_H_
+
+#include <dmlc/logging.h>
+#include <mshadow/base.h>
+
+/*! \brief Macros/inlines to assist CLion to parse Cuda files (*.cu, *.cuh) */
+#ifdef __JETBRAINS_IDE__
+#define __CUDACC__ 1
+#define __host__
+#define __device__
+#define __global__
+#define __forceinline__
+#define __shared__
+inline void __syncthreads() {}
+inline void __threadfence_block() {}
+template<class T> inline T __clz(const T val) { return val; }
+struct __cuda_fake_struct { int x; int y; int z; };
+extern __cuda_fake_struct blockDim;
+extern __cuda_fake_struct threadIdx;
+extern __cuda_fake_struct blockIdx;
+#endif
+
+#if MXNET_USE_CUDA
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <curand.h>
+
+namespace mxnet {
+namespace common {
+/*! \brief common utils for cuda */
+namespace cuda {
+/*!
+ * \brief Get string representation of cuBLAS errors.
+ * \param error The error.
+ * \return String representation.
+ */
+inline const char* CublasGetErrorString(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+  default:
+    break;
+  }
+  return "Unknown cuBLAS status";
+}
+
+/*!
+ * \brief Get string representation of cuRAND errors.
+ * \param status The status.
+ * \return String representation.
+ */
+inline const char* CurandGetErrorString(curandStatus_t status) {
+  switch (status) {
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown cuRAND status";
+}
+
+}  // namespace cuda
+}  // namespace common
+}  // namespace mxnet
+
+/*!
+ * \brief Check CUDA error.
+ * \param msg Message to print if an error occured.
+ */
+#define CHECK_CUDA_ERROR(msg)                                                \
+  {                                                                          \
+    cudaError_t e = cudaGetLastError();                                      \
+    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
+  }
+
+/*!
+ * \brief Protected CUDA call.
+ * \param func Expression to call.
+ *
+ * It checks for CUDA errors after invocation of the expression.
+ */
+#define CUDA_CALL(func)                                            \
+  {                                                                \
+    cudaError_t e = (func);                                        \
+    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)       \
+        << "CUDA: " << cudaGetErrorString(e);                      \
+  }
+
+/*!
+ * \brief Protected cuBLAS call.
+ * \param func Expression to call.
+ *
+ * It checks for cuBLAS errors after invocation of the expression.
+ */
+#define CUBLAS_CALL(func)                                       \
+  {                                                             \
+    cublasStatus_t e = (func);                                  \
+    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                          \
+        << "cuBLAS: " << common::cuda::CublasGetErrorString(e); \
+  }
+
+/*!
+ * \brief Protected cuRAND call.
+ * \param func Expression to call.
+ *
+ * It checks for cuRAND errors after invocation of the expression.
+ */
+#define CURAND_CALL(func)                                       \
+  {                                                             \
+    curandStatus_t e = (func);                                  \
+    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                          \
+        << "cuRAND: " << common::cuda::CurandGetErrorString(e); \
+  }
+
+#endif  // MXNET_USE_CUDA
+
+#if MXNET_USE_CUDNN
+
+#include <cudnn.h>
+
+#define CUDNN_CALL(func)                                                      \
+  {                                                                           \
+    cudnnStatus_t e = (func);                                                 \
+    CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \
+  }
+
+#endif  // MXNET_USE_CUDNN
+
+// Overload atomicAdd to work for floats on all architectures
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// From CUDA Programming Guide
+static inline  __device__  void atomicAdd(double *address, double val) {
+  unsigned long long* address_as_ull =                  // NOLINT(*)
+    reinterpret_cast<unsigned long long*>(address);     // NOLINT(*)
+  unsigned long long old = *address_as_ull;             // NOLINT(*)
+  unsigned long long assumed;                           // NOLINT(*)
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+}
+#endif
+
+// Overload atomicAdd for half precision
+// Taken from:
+// https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
+#if defined(__CUDA_ARCH__)
+static inline __device__ void atomicAdd(mshadow::half::half_t *address,
+                                        mshadow::half::half_t val) {
+  unsigned int *address_as_ui =
+      reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) -
+                                   (reinterpret_cast<size_t>(address) & 2));
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    mshadow::half::half_t hsum;
+    hsum.half_ =
+        reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
+    hsum += val;
+    old = reinterpret_cast<size_t>(address) & 2
+              ? (old & 0xffff) | (hsum.half_ << 16)
+              : (old & 0xffff0000) | hsum.half_;
+    old = atomicCAS(address_as_ui, assumed, old);
+  } while (assumed != old);
+}
+#endif
+
+#endif  // MXNET_COMMON_CUDA_UTILS_H_