Skip to content

Commit 7d66a73

Browse files
committed
updated kmeans
1 parent d15b52a commit 7d66a73

6 files changed

Lines changed: 285 additions & 25 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,8 @@ add_test(cuda_basics.subflow ${TF_UTEST_CUDA_BASICS} -tc=Subflow)
573573
add_test(cuda_basics.nested_subflow ${TF_UTEST_CUDA_BASICS} -tc=NestedSubflow)
574574
add_test(cuda_basics.detached_subflow ${TF_UTEST_CUDA_BASICS} -tc=DetachedSubflow)
575575
add_test(cuda_basics.loop ${TF_UTEST_CUDA_BASICS} -tc=Loop)
576+
add_test(cuda_basics.predicate ${TF_UTEST_CUDA_BASICS} -tc=Predicate)
577+
add_test(cuda_basics.repeat ${TF_UTEST_CUDA_BASICS} -tc=Repeat)
576578

577579
# matrix operation tests
578580
add_executable(cuda_matrix ${TF_UTEST_DIR}/cuda/cuda_matrix.cu)

doxygen/examples/kmeans.dox

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,15 @@ It precedes a condition task that circles back to itself until we reach @c M ite
418418
When iteration completes, the condition task directs the execution path to the %cudaFlow, @c h2d,
419419
to copy the results of clusters to @c h_mx and @c h_my and then deallocate all GPU memory.
420420

421+
@section PredicateInsteadOfConditionalTasking Predicate instead of Conditional Tasking
422+
423+
The condition task that iterates the k-means %cudaFlow may be replaced with a simple predicate,
424+
given that the graph parameters remain unchanged across all iterations.
425+
In this case, we can create the %cudaFlow once and launch it repeatedly as rapidly as possible.
426+
427+
@code{.cpp}
428+
@endcode
429+
421430
@section KMeansBenchmarking Benchmarking
422431

423432
We run three versions of kmeans,
@@ -426,13 +435,13 @@ on a machine of 6 Intel i7-8700 CPUs at 3.20GHz and a Nvidia RTX 2080 GPU using
426435
2D point counts and iterations.
427436

428437
<div align="center">
429-
| N | K | M | CPU Sequential | CPU Parallel | GPU Parallel |
430-
| :-: | :-: | :-: | :-: | :-: | :-: |
431-
| 10 | 5 | 10 | 0.14 ms | 77 ms | 1 ms |
432-
| 100 | 10 | 100 | 0.56 ms | 86 ms | 7 ms |
433-
| 1000 | 10 | 1000 | 10 ms | 98 ms | 55 ms |
434-
| 10000 | 10 | 10000 | 1006 ms | 713 ms | 458 ms |
435-
| 100000 | 10 | 100000 | 102483 ms | 49966 ms | 7952 ms |
438+
| N | K | M | CPU Sequential | CPU Parallel | GPU (conditional tasking) | GPU (with predicate) |
439+
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
440+
| 10 | 5 | 10 | 0.14 ms | 77 ms | 1 ms | 1 ms |
441+
| 100 | 10 | 100 | 0.56 ms | 86 ms | 7 ms | 1 ms |
442+
| 1000 | 10 | 1000 | 10 ms | 98 ms | 55 ms | 13 ms |
443+
| 10000 | 10 | 10000 | 1006 ms | 713 ms | 458 ms | 183 ms |
444+
| 100000 | 10 | 100000 | 102483 ms | 49966 ms | 7952 ms | 4725 ms |
436445
</div>
437446

438447
When the number of points is larger than 10K,

examples/cuda/kmeans.cu

Lines changed: 131 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
// This program implements the k-means clustering algorithm in three forms:
22
// - sequential cpu
33
// - parallel cpu
4-
// - gpu
4+
// - gpu with conditional tasking
5+
// - gpu without conditional tasking
56

67
#include <taskflow/taskflow.hpp>
78

@@ -220,7 +221,7 @@ __global__ void compute_new_means(
220221
my[cluster] = sy[cluster] / count;
221222
}
222223

223-
// run k-means on gpu
224+
// Runs k-means on gpu using conditional tasking
224225
std::pair<std::vector<float>, std::vector<float>> gpu(
225226
const int N,
226227
const int K,
@@ -335,6 +336,117 @@ std::pair<std::vector<float>, std::vector<float>> gpu(
335336
return {h_mx, h_my};
336337
}
337338

339+
// Runs k-means on gpu without using conditional tasking
340+
std::pair<std::vector<float>, std::vector<float>> gpu_predicate(
341+
const int N,
342+
const int K,
343+
const int M,
344+
const std::vector<float>& h_px,
345+
const std::vector<float>& h_py
346+
) {
347+
348+
std::vector<float> h_mx, h_my;
349+
float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy, *d_c;
350+
351+
for(int i=0; i<K; ++i) {
352+
h_mx.push_back(h_px[i]);
353+
h_my.push_back(h_py[i]);
354+
}
355+
356+
// create a taskflow graph
357+
tf::Executor executor;
358+
tf::Taskflow taskflow("K-Means");
359+
360+
auto allocate_px = taskflow.emplace([&](){
361+
TF_CHECK_CUDA(cudaMalloc(&d_px, N*sizeof(float)), "failed to allocate d_px");
362+
}).name("allocate_px");
363+
364+
auto allocate_py = taskflow.emplace([&](){
365+
TF_CHECK_CUDA(cudaMalloc(&d_py, N*sizeof(float)), "failed to allocate d_py");
366+
}).name("allocate_py");
367+
368+
auto allocate_mx = taskflow.emplace([&](){
369+
TF_CHECK_CUDA(cudaMalloc(&d_mx, K*sizeof(float)), "failed to allocate d_mx");
370+
}).name("allocate_mx");
371+
372+
auto allocate_my = taskflow.emplace([&](){
373+
TF_CHECK_CUDA(cudaMalloc(&d_my, K*sizeof(float)), "failed to allocate d_my");
374+
}).name("allocate_my");
375+
376+
auto allocate_sx = taskflow.emplace([&](){
377+
TF_CHECK_CUDA(cudaMalloc(&d_sx, K*sizeof(float)), "failed to allocate d_sx");
378+
}).name("allocate_sx");
379+
380+
auto allocate_sy = taskflow.emplace([&](){
381+
TF_CHECK_CUDA(cudaMalloc(&d_sy, K*sizeof(float)), "failed to allocate d_sy");
382+
}).name("allocate_sy");
383+
384+
auto allocate_c = taskflow.emplace([&](){
385+
TF_CHECK_CUDA(cudaMalloc(&d_c, K*sizeof(float)), "failed to allocate dc");
386+
}).name("allocate_c");
387+
388+
auto h2d = taskflow.emplace([&](tf::cudaFlow& cf){
389+
cf.copy(d_px, h_px.data(), N).name("h2d_px");
390+
cf.copy(d_py, h_py.data(), N).name("h2d_py");
391+
cf.copy(d_mx, h_mx.data(), K).name("h2d_mx");
392+
cf.copy(d_my, h_my.data(), K).name("h2d_my");
393+
}).name("h2d");
394+
395+
auto kmeans = taskflow.emplace([&](tf::cudaFlow& cf){
396+
397+
auto zero_c = cf.zero(d_c, K).name("zero_c");
398+
auto zero_sx = cf.zero(d_sx, K).name("zero_sx");
399+
auto zero_sy = cf.zero(d_sy, K).name("zero_sy");
400+
401+
auto cluster = cf.kernel(
402+
(N+1024-1) / 1024, 1024, 0,
403+
assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
404+
).name("cluster");
405+
406+
auto new_centroid = cf.kernel(
407+
1, K, 0,
408+
compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
409+
).name("new_centroid");
410+
411+
cluster.precede(new_centroid)
412+
.succeed(zero_c, zero_sx, zero_sy);
413+
414+
cf.repeat(M);
415+
}).name("update_means");
416+
417+
auto stop = taskflow.emplace([&](tf::cudaFlow& cf){
418+
cf.copy(h_mx.data(), d_mx, K).name("d2h_mx");
419+
cf.copy(h_my.data(), d_my, K).name("d2h_my");
420+
}).name("d2h");
421+
422+
auto free = taskflow.emplace([&](){
423+
TF_CHECK_CUDA(cudaFree(d_px), "failed to free d_px");
424+
TF_CHECK_CUDA(cudaFree(d_py), "failed to free d_py");
425+
TF_CHECK_CUDA(cudaFree(d_mx), "failed to free d_mx");
426+
TF_CHECK_CUDA(cudaFree(d_my), "failed to free d_my");
427+
TF_CHECK_CUDA(cudaFree(d_sx), "failed to free d_sx");
428+
TF_CHECK_CUDA(cudaFree(d_sy), "failed to free d_sy");
429+
TF_CHECK_CUDA(cudaFree(d_c), "failed to free d_c");
430+
}).name("free");
431+
432+
// build up the dependency
433+
h2d.succeed(allocate_px, allocate_py, allocate_mx, allocate_my);
434+
435+
kmeans.succeed(allocate_sx, allocate_sy, allocate_c, h2d)
436+
.precede(stop);
437+
438+
stop.precede(free);
439+
440+
//taskflow.dump(std::cout);
441+
442+
// run the taskflow
443+
executor.run(taskflow).wait();
444+
445+
//std::cout << "dumping kmeans graph ...\n";
446+
//taskflow.dump(std::cout);
447+
return {h_mx, h_my};
448+
}
449+
338450
// Function: main
339451
int main(int argc, const char* argv[]) {
340452

@@ -398,22 +510,35 @@ int main(int argc, const char* argv[]) {
398510
<< std::setw(10) << my[k] << '\n';
399511
}
400512

401-
// k-means on gpu
402-
std::cout << "running k-means on gpu ... ";
513+
// k-means on gpu with conditional tasking
514+
std::cout << "running k-means on gpu (with conditional tasking) ... ";
403515
auto gbeg = std::chrono::steady_clock::now();
404516
std::tie(mx, my) = gpu(N, K, M, h_px, h_py);
405517
auto gend = std::chrono::steady_clock::now();
406518
std::cout << "completed with "
407519
<< std::chrono::duration_cast<std::chrono::milliseconds>(gend-gbeg).count()
408520
<< " ms\n";
409521

410-
std::cout << "k centroids found by gpu\n";
522+
std::cout << "k centroids found by gpu (with conditional tasking)\n";
411523
for(int k=0; k<K; ++k) {
412524
std::cout << "centroid " << k << ": " << std::setw(10) << mx[k] << ' '
413525
<< std::setw(10) << my[k] << '\n';
414526
}
415527

416-
528+
// k-means on gpu without conditional tasking
529+
std::cout << "running k-means on gpu (without conditional tasking) ... ";
530+
auto rbeg = std::chrono::steady_clock::now();
531+
std::tie(mx, my) = gpu_predicate(N, K, M, h_px, h_py);
532+
auto rend = std::chrono::steady_clock::now();
533+
std::cout << "completed with "
534+
<< std::chrono::duration_cast<std::chrono::milliseconds>(rend-rbeg).count()
535+
<< " ms\n";
536+
537+
std::cout << "k centroids found by gpu (without conditional tasking)\n";
538+
for(int k=0; k<K; ++k) {
539+
std::cout << "centroid " << k << ": " << std::setw(10) << mx[k] << ' '
540+
<< std::setw(10) << my[k] << '\n';
541+
}
417542

418543
return 0;
419544
}

taskflow/core/executor.hpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ inline void Executor::_invoke_cudaflow_work_impl(Worker& w, Node* node) {
948948

949949
h.graph.clear();
950950

951-
cudaFlow cf(h.graph);
951+
cudaFlow cf(h.graph, [repeat=1] () mutable { return repeat-- == 0; });
952952

953953
h.work(cf);
954954

@@ -973,14 +973,16 @@ inline void Executor::_invoke_cudaflow_work_impl(Worker& w, Node* node) {
973973
cudaGraphInstantiate(&exec, h.graph._native_handle, nullptr, nullptr, 0),
974974
"failed to create an executable cudaGraph"
975975
);
976+
977+
while(!cf._predicate()) {
978+
TF_CHECK_CUDA(
979+
cudaGraphLaunch(exec, s), "failed to launch cudaGraph on stream ", s
980+
);
976981

977-
TF_CHECK_CUDA(
978-
cudaGraphLaunch(exec, s), "failed to launch cudaGraph on stream ", s
979-
);
980-
981-
TF_CHECK_CUDA(
982-
cudaStreamSynchronize(s), "failed to synchronize stream ", s
983-
);
982+
TF_CHECK_CUDA(
983+
cudaStreamSynchronize(s), "failed to synchronize stream ", s
984+
);
985+
}
984986

985987
TF_CHECK_CUDA(
986988
cudaGraphExecDestroy(exec), "failed to destroy an executable cudaGraph"

taskflow/cuda/cuda_flow.hpp

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,13 @@ class cudaFlow {
2424
/**
2525
@brief constructs a cudaFlow builder object
2626
27+
@tparam P predicate type
28+
2729
@param graph a cudaGraph to manipulate
30+
@param p predicate which returns @c true if the iteration should stop
2831
*/
29-
cudaFlow(cudaGraph& graph);
32+
template <typename P>
33+
cudaFlow(cudaGraph& graph, P&& p);
3034

3135
/**
3236
@brief queries the emptiness of the graph
@@ -168,7 +172,7 @@ class cudaFlow {
168172
169173
@return cudaTask handle
170174
171-
A copy task transfers num*sizeof(T) bytes of data from a source location
175+
A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location
172176
to a target location. Direction can be arbitrary among CPUs and GPUs.
173177
*/
174178
template <
@@ -196,17 +200,49 @@ class cudaFlow {
196200
*/
197201
void stream(cudaStream_t stream);
198202

203+
/**
204+
@brief assigns a predicate to loop the cudaFlow until the predicate is satisfied
205+
206+
@tparam P predicate type
207+
@param p predicate which returns @c true if the iteration should stop
208+
209+
The execution of cudaFlow is equivalent to: <tt>while(!predicate()) { run cudaflow; }</tt>
210+
*/
211+
template <typename P>
212+
void predicate(P&& p);
213+
214+
/**
215+
@brief repeats the execution of the cudaFlow by @c n times
216+
*/
217+
void repeat(size_t n);
218+
199219
private:
200220

201221
cudaGraph& _graph;
202222

203223
int _device {0};
204224

205225
nstd::optional<cudaStream_t> _stream;
226+
227+
std::function<bool()> _predicate;
206228
};
207229

208230
// Constructor
209-
inline cudaFlow::cudaFlow(cudaGraph& g) : _graph {g} {
231+
template <typename P>
232+
cudaFlow::cudaFlow(cudaGraph& g, P&& p) :
233+
_graph {g},
234+
_predicate {std::forward<P>(p)} {
235+
}
236+
237+
// Procedure: predicate
238+
template <typename P>
239+
void cudaFlow::predicate(P&& pred) {
240+
_predicate = std::forward<P>(pred);
241+
}
242+
243+
// Procedure: repeat
244+
inline void cudaFlow::repeat(size_t n) {
245+
_predicate = [n] () mutable { return n-- == 0; };
210246
}
211247

212248
// Function: empty

0 commit comments

Comments
 (0)