added parallel_for on index

tsung-wei-huang · tsung-wei-huang · commit e4555503269f · 2018-12-17T16:52:59.000-06:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -172,6 +172,7 @@ add_test(builder          ${TF_UTEST_DIR}/taskflow -tc=Builder)
 add_test(dispatch         ${TF_UTEST_DIR}/taskflow -tc=Dispatch)
 add_test(executor         ${TF_UTEST_DIR}/taskflow -tc=Executor)
 add_test(parallel_for     ${TF_UTEST_DIR}/taskflow -tc=ParallelFor)
+add_test(parallel_for_idx ${TF_UTEST_DIR}/taskflow -tc=ParallelForOnIndex)
 add_test(reduce           ${TF_UTEST_DIR}/taskflow -tc=Reduce)
 add_test(reduce_min       ${TF_UTEST_DIR}/taskflow -tc=ReduceMin)
 add_test(reduce_max       ${TF_UTEST_DIR}/taskflow -tc=ReduceMax)
diff --git a/README.md b/README.md
@@ -479,6 +479,7 @@ Visit [documentation][wiki] to see the complete list.
 | placeholder     | none        | task         | insert a node without any work; work can be assigned later |
 | linearize       | task list   | none         | create a linear dependency in the given task list |
 | parallel_for    | beg, end, callable, group | task pair | apply the callable in parallel and group-by-group to the result of dereferencing every iterator in the range | 
+| parallel_for    | beg, end, step, callable, group | task pair | apply the callable in parallel and group-by-group to an index-based range | 
 | reduce | beg, end, res, bop | task pair | reduce a range of elements to a single result through a binary operator | 
 | transform_reduce | beg, end, res, bop, uop | task pair | apply a unary operator to each element in the range and reduce them to a single result through a binary operator | 
 | dispatch        | none        | future | dispatch the current graph and return a shared future to block on completion |
@@ -569,12 +570,42 @@ auto [S, T] = tf.parallel_for(
   [] (int i) { 
     std::cout << "AB and CD run in parallel" << '\n';
   },
-  2  // group to execute two tasks at a time
+  2  // group two tasks at a time
 );
 ```
 
 By default, taskflow performs an even partition over worker threads
-if the group size is not specified.
+if the group size is not specified (or equal to 0).
+
+In addition to the iterator-range overload, parallel\_for has another overload for index-based loops.
+The first three arguments to this overload indicate the 
+starting index, ending index (exclusive), and step size.
+
+```cpp
+// [0, 10) with a step size of 2
+auto [S, T] = tf.parallel_for(
+  0, 10, 2, 
+  [] (int i) {
+    std::cout << "parallel_for on index " << i << std::endl;
+  }, 
+  2  // group two tasks at a time
+);
+// will print 0, 2, 4, 6, 8 (three groups, {0, 2}, {4, 6}, {8})
+```
+
+You can also iterate in the opposite direction by swapping the starting index and the ending index
+and using a negative step size.
+
+```cpp
+// [10, 0) with a step size of -2
+auto [S, T] = tf.parallel_for(
+  10, 0, -2, 
+  [] (int i) {
+    std::cout << "parallel_for on index " << i << std::endl;
+  }
+);
+// will print 10, 8, 6, 4, 2 (group size decided by taskflow)
+```
 
 ### *reduce/transform_reduce*
 
diff --git a/example/parallel_for.cpp b/example/parallel_for.cpp
@@ -2,65 +2,37 @@
 #include <cassert>
 #include <numeric>
 
-// Function: fib
-int fib(int n) {
-  if(n <= 2) return n;
-  return (fib(n-1) + fib(n-2))%1024;
-}
-
-// ----------------------------------------------------------------------------
-
-// Procedure: sequential
-void sequential(int N) {
-  auto tbeg = std::chrono::steady_clock::now();
-  for(int i=0; i<N; ++i) {
-    printf("fib[%d]=%d\n", i, fib(i));
-  }
-  auto tend = std::chrono::steady_clock::now();
-  std::cout << "sequential version takes " 
-            << std::chrono::duration_cast<std::chrono::milliseconds>(tend-tbeg).count() 
-            << " ms\n";
-}
-
-// Procedure: taskflow
-void taskflow(int N) {
+// Procedure: parallel_for_on_range
+void parallel_for_on_range(int N) {
 
   std::vector<int> range(N);
   std::iota(range.begin(), range.end(), 0);
 
-  auto tbeg = std::chrono::steady_clock::now();
   tf::Taskflow tf;
   tf.parallel_for(range, [&] (const int i) { 
-    printf("fib[%d]=%d\n", i, fib(i));
-  }, 1);
+    printf("parallel_for on container item: %d\n", i);
+  });
   tf.wait_for_all();
+}
 
-  auto tend = std::chrono::steady_clock::now();
-  std::cout << "taskflow version takes " 
-            << std::chrono::duration_cast<std::chrono::milliseconds>(tend-tbeg).count() 
-            << " ms\n";
+// Procedure: parallel_for_on_index
+void parallel_for_on_index(int N) {
+  tf::Taskflow tf;
+
+  // [0, N) with step size 1
+  tf.parallel_for(0, N, 1, [] (int i) {
+    printf("parallel_for on index: %d\n", i);
+  });
+  tf.wait_for_all();
 }
 
 // ----------------------------------------------------------------------------
 
 // Function: main
 int main(int argc, char* argv[]) {
 
-  if(argc != 3) {
-    std::cerr << "usage: ./parallel_for [baseline|taskflow] N\n";
-    std::exit(EXIT_FAILURE);
-  }
-  
-  // Run methods
-  if(std::string_view method(argv[1]); method == "baseline") {
-    sequential(std::atoi(argv[2]));
-  }
-  else if(method == "taskflow") {
-    taskflow(std::atoi(argv[2]));
-  }
-  else {
-    std::cerr << "wrong method, shoud be [baseline|taskflow]\n";
-  }
+  parallel_for_on_range(10);
+  parallel_for_on_index(10);
 
   return 0;
 }
diff --git a/taskflow/error/error.hpp b/taskflow/error/error.hpp
@@ -12,6 +12,7 @@ struct Error : public std::error_category {
 
   enum Code : int {
     SUCCESS = 0,
+    FLOW_BUILDER,
     EXECUTOR
   };
 
@@ -39,6 +40,10 @@ inline std::string Error::message(int code) const {
       return "success";
     break;
 
+    case FLOW_BUILDER:
+      return "flow builder error";
+    break;
+
     case EXECUTOR:
       return "executor error";
     break;
diff --git a/taskflow/graph/flow_builder.hpp b/taskflow/graph/flow_builder.hpp
@@ -29,11 +29,7 @@ class FlowBuilder {
     template <typename T, typename C, std::enable_if_t<is_iterable_v<T>, void>* = nullptr>
     auto parallel_for(T&, C&&, size_t = 0);
 
-    template <
-      typename I, 
-      typename C, 
-      std::enable_if_t<std::is_arithmetic_v<I>, void>* = nullptr
-    >
+    template <typename I, typename C, std::enable_if_t<std::is_arithmetic_v<I>, void>* = nullptr >
     auto parallel_for(I, I, I, C&&, size_t = 0);
 
     template <typename I, typename T, typename B>
@@ -71,6 +67,9 @@ class FlowBuilder {
 
     template <typename L>
     void _linearize(L&);
+
+    template <typename I>
+    size_t _estimate_chunk_size(I, I, I);
 };
 
 // Constructor
@@ -178,40 +177,93 @@ auto FlowBuilder::parallel_for(T& t, C&& c, size_t group) {
 template <
   typename I, 
   typename C, 
-  std::enable_if_t<std::is_arithmetic_v<I>, void>* = nullptr
+  std::enable_if_t<std::is_arithmetic_v<I>, void>*
 >
-auto FlowBuilder::parallel_for(I beg, I end, I step, C&& c, size_t g) {
+auto FlowBuilder::parallel_for(I beg, I end, I s, C&& c, size_t g) {
 
-  if(g == 0) {
-    auto N = (end - beg + step - 1) / step;
-    auto w = std::max(unsigned{1}, std::thread::hardware_concurrency());
-    g = (N + w - 1) / w;
-  }
+  using T = std::decay_t<I>;
 
+  if((s == 0 && beg != end) || (beg < end && s <= 0) || (beg > end && s >=0) ) {
+    TF_THROW(Error::FLOW_BUILDER, 
+      "invalid range [", beg, ", ", end, ") with step size ", s
+    );
+  }
+    
   auto source = placeholder();
   auto target = placeholder();
 
-  std::cout << "g is " << g << std::endl;
-  
-  while(beg < end) {
+  if(g == 0) {
+    g = _estimate_chunk_size(beg, end, s);
+  }
 
-    auto e = beg + static_cast<I>(g) * step;
+  // Integer indices
+  if constexpr(std::is_integral_v<T>) {
 
-    std::cout << beg << " " << e << std::endl;
-    
-    // Create a task
-    auto task = silent_emplace([beg, e, step, c] () mutable {
-      for(auto i=beg; i<e; i+=step) {
-        c(i);
-      }
-    });
-    source.precede(task);
-    task.precede(target);
+    auto offset = static_cast<T>(g) * s;
 
-    // adjust the pointer
-    beg = e;
+    // positive case
+    if(beg < end) {
+      while(beg != end) {
+        auto e = std::min(beg + offset, end);
+        auto task = silent_emplace([=] () mutable {
+          for(auto i=beg; i<e; i+=s) {
+            c(i);
+          }
+        });
+        source.precede(task);
+        task.precede(target);
+        beg = e;
+      }
+    }
+    // negative case
+    else if(beg > end) {
+      while(beg != end) {
+        auto e = std::max(beg + offset, end);
+        auto task = silent_emplace([=] () mutable {
+          for(auto i=beg; i>e; i+=s) {
+            c(i);
+          }
+        });
+        source.precede(task);
+        task.precede(target);
+        beg = e;
+      }
+    }
   }
+  // We enumerate the entire sequence to avoid floating error
+  else if constexpr(std::is_floating_point_v<T>) {
+    size_t N = 0;
+    auto B = beg;
+    for(auto i=beg; (beg<end ? i<end : i>end); i+=s, ++N) {
+      if(N == g) {
+        auto task = silent_emplace([=] () mutable {
+          auto b = B;
+          for(size_t n=0; n<N; ++n) {
+            c(b);
+            b += s; 
+          }
+        });
+        N = 0;
+        B = i;
+        source.precede(task);
+        task.precede(target);
+      }
+    }
 
+    // the last piece
+    if(N != 0) {
+      auto task = silent_emplace([=] () mutable {
+        auto b = B;
+        for(size_t n=0; n<N; ++n) {
+          c(b);
+          b += s; 
+        }
+      });
+      source.precede(task);
+      task.precede(target);
+    }
+  }
+    
   return std::make_pair(source, target); 
 }
 
@@ -345,6 +397,33 @@ auto FlowBuilder::transform_reduce(I beg, I end, T& result, B&& bop, P&& pop, U&
   return std::make_pair(source, target); 
 }
 
+// Function: _estimate_chunk_size
+// Picks a per-task chunk size that spreads the iterations of [beg, end)
+// (walked by step) evenly over the hardware threads; 0 for an empty range.
+template <typename I>
+size_t FlowBuilder::_estimate_chunk_size(I beg, I end, I step) {
+
+  using T = std::decay_t<I>;
+
+  // Empty range: step may be 0 here, so bail out before dividing by it.
+  if(beg == end) return 0;
+
+  size_t w = std::max(unsigned{1}, std::thread::hardware_concurrency());
+  size_t N = 0;
+  if constexpr(std::is_integral_v<T>) {
+    // ceiling division; the rounding term follows the direction of travel
+    N = (beg < end) ? (end - beg + step - 1) / step
+                    : (end - beg + step + 1) / step;
+  }
+  else if constexpr(std::is_floating_point_v<T>) {
+    N = std::ceil((end - beg) / step);
+  }
+  else {
+    static_assert(dependent_false_v<T>, "can't deduce chunk size");
+  }
+  return (N + w - 1) / w;
+}
+
 
 // Procedure: _linearize
 template <typename L>
diff --git a/taskflow/utility/utility.hpp b/taskflow/utility/utility.hpp
@@ -20,6 +20,7 @@
 #include <cassert>
 #include <optional>
 #include <variant>
+#include <cmath>
 
 namespace tf {
 
diff --git a/unittest/taskflow.cpp b/unittest/taskflow.cpp