added memset node

twhuang-utah · twhuang-utah · commit 852923e91da2 · 2020-03-23T23:13:39.000-06:00
diff --git a/taskflow/cuda/cuda_flow_builder.hpp b/taskflow/cuda/cuda_flow_builder.hpp
@@ -7,7 +7,7 @@ namespace tf {
 /**
 @class cudaFlow
 
-@brief Building methods of a cuda task dependency graph.
+@brief Building methods for a cuda task dependency graph.
 */
 class cudaFlow {
 
@@ -78,6 +78,18 @@ class cudaFlow {
     */
     template <typename F, typename... ArgsT>
     cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
+
+    /**
+    @brief creates a memset node
+
+    @param dst pointer to the destination device memory area
+    @param ch value to set for each byte of specified memory
+    @param count size in bytes to set
+
+    A memset task fills the first @c count bytes of the device memory area
+    pointed to by @c dst with the byte value @c ch.
+    */
+    cudaTask memset(void* dst, int ch, size_t count);
     
     /**
     @brief creates an 1D copy task
@@ -240,6 +252,32 @@ cudaTask cudaFlow::kernel_on(
   return cudaTask(node);
 }
 
+// Function: memset
+// Adds a Memset node whose deferred work inserts a byte-wise memset.
+inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
+  // dst, ch and count are captured by value for use when the work is invoked.
+  auto fill = [=] (cudaGraph_t& native_graph, cudaGraphNode_t& native_node) {
+    cudaMemsetParams params;
+    // Describe the region as a single row of count one-byte elements.
+    params.elementSize = 1;  // either 1, 2, or 4
+    params.width = count;
+    params.height = 1;
+    params.pitch = 0;
+    params.dst = dst;
+    params.value = ch;
+    TF_CHECK_CUDA(
+      cudaGraphAddMemsetNode(&native_node, native_graph, nullptr, 0, &params),
+      "failed to create a cudaMemset node"
+    );
+  };
+
+  using memset_tag = nstd::in_place_type_t<cudaNode::Memset>;
+  auto node = _graph.emplace_back(memset_tag{}, std::move(fill));
+
+  return cudaTask(node);
+}
+
+
 // Function: copy
 template <
   typename T,
diff --git a/taskflow/cuda/cuda_graph.hpp b/taskflow/cuda/cuda_graph.hpp
@@ -27,7 +27,8 @@ class cudaNode {
   //struct Host {
   //  cudaHostNodeParams param;
   //};
-
+  
+  // Noop handle
   struct Noop {
 
     template <typename C>
@@ -36,14 +37,21 @@ class cudaNode {
     std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
   };
 
+  // Memset handle: holds the deferred work run for a memset node
+  struct Memset {
+    // Constructs the handle from a callable that is forwarded into work
+    template <typename C>
+    Memset(C&&);
+    // Callable invoked with the native graph and this node's native handle
+    std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
+  };
+
   // Copy handle
   struct Copy {
     
     template <typename C>
     Copy(C&&);
 
-    //cudaMemcpy3DParms param;
-
     std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
   };
   
@@ -53,16 +61,15 @@ class cudaNode {
     template <typename C>
     Kernel(C&&);
 
-    //cudaKernelNodeParams param;
-
     std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
   };
 
-  using handle_t = nstd::variant<nstd::monostate, Noop, Copy, Kernel>;
+  using handle_t = nstd::variant<nstd::monostate, Noop, Memset, Copy, Kernel>;
   
   // variant index
-  constexpr static auto NOOP = get_index_v<Noop, handle_t>;
-  constexpr static auto COPY = get_index_v<Copy, handle_t>; 
+  constexpr static auto NOOP   = get_index_v<Noop, handle_t>;
+  constexpr static auto MEMSET = get_index_v<Memset, handle_t>;
+  constexpr static auto COPY   = get_index_v<Copy, handle_t>; 
   constexpr static auto KERNEL = get_index_v<Kernel, handle_t>;
 
   public:
@@ -128,6 +135,11 @@ template <typename C>
 cudaNode::Noop::Noop(C&& c) : work {std::forward<C>(c)} {
 }
 
+// Memset handle constructor: forwards the callable c into the work member
+template <typename C>
+cudaNode::Memset::Memset(C&& c) : work {std::forward<C>(c)} {
+}
+
 // Copy handle constructor
 template <typename C>
 cudaNode::Copy::Copy(C&& c) : work {std::forward<C>(c)} {
@@ -146,10 +158,6 @@ cudaNode::cudaNode(ArgsT&&... args) : _handle {std::forward<ArgsT>(args)...} {
 // Procedure: _precede
 inline void cudaNode::_precede(cudaNode* v) {
   _successors.push_back(v);
-  //TF_CHECK_CUDA(
-  //  ::cudaGraphAddDependencies(_graph._handle, &_node, &(v->_node), 1),
-  //  "failed to add a preceding link"
-  //);
 }
 
 // ----------------------------------------------------------------------------
@@ -219,6 +227,12 @@ inline void cudaGraph::_make_native_graph(int d) {
         );
       break;
 
+      case cudaNode::MEMSET:
+        nstd::get<cudaNode::Memset>(node->_handle).work(
+          _native_handle, node->_native_handle
+        );
+      break;
+
       case cudaNode::COPY:
         nstd::get<cudaNode::Copy>(node->_handle).work(
           _native_handle, node->_native_handle
@@ -230,9 +244,6 @@ inline void cudaGraph::_make_native_graph(int d) {
           _native_handle, node->_native_handle
         );
       break;
-
-      default:
-      break;
     }
   }
 
diff --git a/unittests/cuda/cuda_basics.cu b/unittests/cuda/cuda_basics.cu
@@ -247,6 +247,55 @@ TEST_CASE("BSet.i32" * doctest::timeout(300)) {
   bset<int32_t>();
 }
 
+// --------------------------------------------------------
+// Testcase: Memset
+// --------------------------------------------------------
+TEST_CASE("Memset") {
+  // Fills the tail of a device buffer via cudaFlow::memset and checks the result.
+  tf::Executor executor;
+  
+  const int N = 100;
+
+  int* cpu = new int [N];
+  int* gpu = nullptr;
+    
+  REQUIRE(cudaMalloc(&gpu, N*sizeof(int)) == cudaSuccess);
+
+  for(int r=1; r<=100; ++r) {
+    // Use a fresh taskflow each round. Reusing one would accumulate tasks and
+    // re-run earlier lambdas that still reference a previous round's start.
+    tf::Taskflow taskflow;
+    int start = ::rand() % N;
+    for(int i=0; i<N; ++i) {
+      cpu[i] = 999;
+    }
+    
+    taskflow.emplace([&](tf::cudaFlow& cf){
+      dim3 g = {(unsigned)(N+255)/256, 1, 1};
+      dim3 b = {256, 1, 1};
+      auto kset = cf.kernel(g, b, 0, k_set<int>, gpu, N, 123);
+      auto zero = cf.memset(gpu+start, 0x3f, (N-start)*sizeof(int));
+      auto copy = cf.copy(cpu, gpu, N);
+      kset.precede(zero);
+      zero.precede(copy);
+    });
+    
+    executor.run(taskflow).wait();
+    // [0, start) must keep the kernel's 123; [start, N) must hold 0x3f bytes.
+    for(int i=0; i<start; ++i) {
+      REQUIRE(cpu[i] == 123);
+    }
+    for(int i=start; i<N; ++i) {
+      REQUIRE(cpu[i] == 0x3f3f3f3f);
+    }
+  }
+  
+
+  delete [] cpu;
+  REQUIRE(cudaFree(gpu) == cudaSuccess);
+}
+
+
 // --------------------------------------------------------
 // Testcase: Barrier
 // --------------------------------------------------------