added matrix-multiplication benchmark

tsung-wei-huang · tsung-wei-huang · commit 896799ff18a6 · 2019-06-14T23:36:43.000-05:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -320,6 +320,23 @@ target_link_libraries(
 )
 set_target_properties(parallel_dnn PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
 
+## benchmark 6: matrix multiplication (CLI driver in main.cpp plus one kernel source per backend)
+message(STATUS "benchmark 6: matrix multiplication")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/matrix_multiplication)  # executable is written next to the benchmark sources
+add_executable(
+  matrix_multiplication 
+  ${TF_BENCHMARK_DIR}/matrix_multiplication/main.cpp      # option parsing and the size sweep
+  ${TF_BENCHMARK_DIR}/matrix_multiplication/omp.cpp       # OpenMP kernel
+  ${TF_BENCHMARK_DIR}/matrix_multiplication/tbb.cpp       # TBB kernel
+  ${TF_BENCHMARK_DIR}/matrix_multiplication/taskflow.cpp  # Cpp-Taskflow kernel
+)
+target_include_directories(matrix_multiplication PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)  # main.cpp includes <CLI11.hpp>
+target_link_libraries(
+  matrix_multiplication 
+  ${PROJECT_NAME} Threads::Threads ${TBB_IMPORTED_TARGETS} ${OpenMP_CXX_LIBRARIES}
+)
+set_target_properties(matrix_multiplication PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})  # OpenMP compile flags for the whole target; omp.cpp uses OpenMP pragmas
+
 endif()
 
 # -----------------------------------------------------------------------------
diff --git a/benchmark/matrix_multiplication/main.cpp b/benchmark/matrix_multiplication/main.cpp
@@ -0,0 +1,74 @@
+#include "matrix_multiplication.hpp"
+#include <CLI11.hpp>
+
+// Benchmarks one backend ("tf", "tbb" or "omp") on square matrices of size 128, 160, ..., 1024.
+// For each size the kernel is timed num_rounds times with num_threads threads and the
+// mean runtime in milliseconds is printed next to the size.
+void matrix_multiplication(
+  const std::string& model,
+  const unsigned num_threads,
+  const unsigned num_rounds
+  ) {
+
+  // a single timed run of the selected backend, in microseconds
+  const auto timed_run = [&]() -> double {
+    if(model == "tf")  return measure_time_taskflow(num_threads).count();
+    if(model == "tbb") return measure_time_tbb(num_threads).count();
+    if(model == "omp") return measure_time_omp(num_threads).count();
+    assert(false);  // unknown model name
+    return 0.0;     // reached only when asserts are compiled out; contributes nothing
+  };
+
+  std::cout << std::setw(12) << "size" << std::setw(12) << "runtime" << std::endl;
+
+  for(int size=128; size<=1024; size += 32) {
+
+    N = size;  // global dimension read by the allocation helpers and the kernels
+    allocate_matrix();
+
+    double total_us {0.0};
+    for(unsigned round=0; round<num_rounds; ++round) {
+      total_us += timed_run();
+    }
+
+    // microseconds summed over all rounds -> mean milliseconds per round
+    std::cout << std::setw(12) << N
+              << std::setw(12) << total_us / num_rounds / 1e3
+              << std::endl;
+
+    deallocate_matrix();
+  }
+}
+
+// Entry point: parses -t/--num_threads, -r/--num_rounds and -m/--model, echoes the
+// chosen settings and then runs the benchmark sweep.
+int main(int argc, char* argv[]) {
+
+  // values used when an option is absent from the command line
+  unsigned num_threads {1};
+  unsigned num_rounds {1};
+  std::string model {"tf"};
+
+  // CLI11 validator: an empty string accepts the value, anything else is the error text
+  const auto validate_model = [] (const std::string& name) {
+    const bool known = (name == "tbb" || name == "tf" || name == "omp");
+    return known ? "" : "model name should be \"tbb\", \"omp\", or \"tf\"";
+  };
+
+  CLI::App app{"MatrixMultiplication"};
+  app.add_option("-t,--num_threads", num_threads, "number of threads (default=1)");
+  app.add_option("-r,--num_rounds", num_rounds, "number of rounds (default=1)");
+  app.add_option("-m,--model", model, "model name tbb|omp|tf (default=tf)")
+     ->check(validate_model);
+
+  CLI11_PARSE(app, argc, argv);
+
+  std::cout << "model=" << model << ' ' << "num_threads=" << num_threads << ' '
+            << "num_rounds=" << num_rounds << ' ' << std::endl;
+
+  matrix_multiplication(model, num_threads, num_rounds);
+
+  return 0;
+}
+
+
diff --git a/benchmark/matrix_multiplication/matrix_multiplication.hpp b/benchmark/matrix_multiplication/matrix_multiplication.hpp
@@ -0,0 +1,50 @@
+#include <algorithm> 
+#include <cassert>
+#include <cstdio>
+#include <chrono>
+#include <iostream>
+#include <iomanip>
+#include <thread>
+#include <random>
+#include <cmath>
+#include <atomic>
+
+inline int N;  // matrix dimension; the driver in main.cpp assigns it before calling allocate_matrix()
+inline double **a, **b, **c;  // N x N matrices stored as arrays of row pointers; the kernels fill a and b and compute c = a * b
+
+std::chrono::microseconds measure_time_taskflow(unsigned);  // argument: number of threads; defined in taskflow.cpp
+std::chrono::microseconds measure_time_tbb(unsigned);       // argument: number of threads; defined in tbb.cpp
+std::chrono::microseconds measure_time_omp(unsigned);       // argument: number of threads; defined in omp.cpp
+
+// Allocates a, b and c as N row pointers each, every row holding N uninitialized doubles.
+inline void allocate_matrix() {
+  const auto table = [] { return static_cast<double**>(std::malloc(N * sizeof(double*))); };
+  const auto row   = [] { return static_cast<double*>(std::malloc(N * sizeof(double))); };
+  a = table();
+  b = table();
+  c = table();
+  // rows are requested in interleaved a/b/c order, one triple per row index
+  for(int r=0; r<N; ++r) { a[r] = row(); b[r] = row(); c[r] = row(); }
+}
+
+// Counterpart of allocate_matrix(): releases every row of a, b and c, then the row tables.
+// N must still be the dimension the matrices were allocated with.
+inline void deallocate_matrix() {
+  int row = 0;
+  while(row < N) {
+    std::free(a[row]); std::free(b[row]); std::free(c[row]);
+    ++row;
+  }
+  std::free(a); std::free(b); std::free(c);
+}
+
+// Checksum of c: every entry is truncated to int64_t and added with exact integer arithmetic.
+inline int64_t reduce_sum() {
+  int64_t sum {0};
+  for(int i=0; i<N; i++) {
+    // cast before adding; `sum += c[i][j]` would round the running total through double
+    for(int j=0; j<N; ++j) sum += static_cast<int64_t>(c[i][j]);
+  }
+  return sum;
+}
+
diff --git a/benchmark/matrix_multiplication/omp.cpp b/benchmark/matrix_multiplication/omp.cpp
@@ -0,0 +1,48 @@
+#include "matrix_multiplication.hpp"
+#include <omp.h>
+
+// matrix_multiplication_omp: fills a and b, zeroes c and computes c = a * b with OpenMP work-sharing loops
+// reference: https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c
+void matrix_multiplication_omp(unsigned nthreads) {
+  
+  omp_set_num_threads(nthreads);  // requested team size for the parallel region below
+
+  int i, j, k;  // loop indices; each thread gets its own copy through the private clause
+
+  #pragma omp parallel shared(a, b, c, nthreads) private(i, j, k)
+  {
+
+    #pragma omp for schedule (static)
+    for (i=0; i<N; i++)  // rows of a are divided statically among the threads
+      for (j=0; j<N; j++)
+        a[i][j]= i+j;
+
+    #pragma omp for schedule (static)
+    for (i=0; i<N; i++)  // rows of b
+      for (j=0; j<N; j++)
+        b[i][j]= i*j;
+
+    #pragma omp for schedule (static)
+    for (i=0; i<N; i++)  // rows of c start at zero because the product loop accumulates into them
+      for (j=0; j<N; j++)
+        c[i][j]= 0;
+
+    #pragma omp for schedule (static)
+    for (i=0; i<N; i++) {  // no nowait on the loops above, so a, b and c are complete before this loop starts
+      for(j=0; j<N; j++) {
+        for (k=0; k<N; k++) {
+          c[i][j] += a[i][k] * b[k][j];  // row i of a times column j of b
+        }
+      }
+    }
+  }
+  
+  //std::cout << reduce_sum() << std::endl;  // debug aid: prints the checksum of c
+}
+
+std::chrono::microseconds measure_time_omp(unsigned num_threads) {
+  using namespace std::chrono;  // times one complete run of the OpenMP kernel
+  const auto started = high_resolution_clock::now();
+  matrix_multiplication_omp(num_threads);
+  return duration_cast<microseconds>(high_resolution_clock::now() - started);  // whole microseconds
+}
diff --git a/benchmark/matrix_multiplication/taskflow.cpp b/benchmark/matrix_multiplication/taskflow.cpp
@@ -0,0 +1,52 @@
+#include "matrix_multiplication.hpp"
+#include <taskflow/taskflow.hpp> 
+
+// matrix_multiplication_taskflow: fills a and b, zeroes c and computes c = a * b as four parallel_for stages of one task graph
+void matrix_multiplication_taskflow(unsigned num_threads) {
+
+  tf::Executor executor(num_threads);  // executor constructed with the requested thread count
+  tf::Taskflow taskflow;
+
+  auto pa = taskflow.parallel_for(0, N, 1, [&] (int i) {  // stage a: one call per row index i in [0, N)
+    for(int j=0; j<N; ++j) {
+      a[i][j] = i + j;
+    }
+  });
+  
+  auto pb = taskflow.parallel_for(0, N, 1, [&] (int i) {  // stage b
+    for(int j=0; j<N; ++j) {
+      b[i][j] = i * j;
+    }
+  });
+  
+  auto pc = taskflow.parallel_for(0, N, 1, [&] (int i) {  // stage c: zero the accumulator matrix
+    for(int j=0; j<N; ++j) {
+      c[i][j] = 0;;
+    }
+  });
+
+  auto pr = taskflow.parallel_for(0, N, 1, [&] (int i) {  // product stage: row i of c
+    for(int j=0; j<N; ++j) {
+      for(int k=0; k<N; k++) {
+        c[i][j] += a[i][k] * b[k][j];
+      }
+    }
+  });
+
+  pa.second.precede(pr.first);  // product waits for a; NOTE(review): assumes .first/.second are a stage's entry/exit tasks - confirm
+  pb.second.precede(pr.first);  // product waits for b
+  pc.second.precede(pr.first);  // product waits for the zeroed c; the three fill stages have no ordering among themselves
+
+  executor.run(taskflow).get();  // presumably run() returns a future and get() blocks until the graph is done - confirm
+  
+  //std::cout << reduce_sum() << std::endl;  // debug aid: prints the checksum of c
+}
+
+std::chrono::microseconds measure_time_taskflow(unsigned num_threads) {
+  using namespace std::chrono;  // times one complete run, executor and graph construction included
+  const auto started = high_resolution_clock::now();
+  matrix_multiplication_taskflow(num_threads);
+  return duration_cast<microseconds>(high_resolution_clock::now() - started);  // whole microseconds
+}
+
+
diff --git a/benchmark/matrix_multiplication/tbb.cpp b/benchmark/matrix_multiplication/tbb.cpp
@@ -0,0 +1,47 @@
+#include "matrix_multiplication.hpp"
+#include <tbb/tbb.h>
+#include <tbb/task_scheduler_init.h>
+
+// matrix_multiplication_tbb: fills a and b, zeroes c and computes c = a * b with four consecutive tbb::parallel_for calls
+void matrix_multiplication_tbb(unsigned num_threads) {
+
+  using namespace tbb;
+  using namespace tbb::flow;  // nothing from tbb::flow is referenced in this function
+  
+  tbb::task_scheduler_init init(num_threads);  // presumably caps TBB at num_threads threads while init is alive - confirm for the TBB version in use
+
+  parallel_for(0, N, 1, [=](int i) {  // a, b, c and N are globals, so [=] copies nothing here
+    for(int j=0; j<N; ++j) {
+      a[i][j] = i + j;
+    }
+  });
+  
+  parallel_for(0, N, 1, [=](int i) {  // rows of b
+    for(int j=0; j<N; ++j) {
+      b[i][j] = i * j;
+    }
+  });
+  
+  parallel_for(0, N, 1, [=](int i) {  // zero c before it is accumulated into
+    for(int j=0; j<N; ++j) {
+      c[i][j] = 0;
+    }
+  });
+  
+  parallel_for(0, N, 1, [=](int i) {  // product, one row of c per call; parallel_for returns only when its range is done, so the fills above are complete
+    for(int j=0; j<N; ++j) {
+      for(int k=0; k<N; ++k) {
+        c[i][j] += a[i][k] * b[k][j];
+      }
+    }
+  });
+
+  //std::cout << reduce_sum() << std::endl;  // debug aid: prints the checksum of c
+}
+
+std::chrono::microseconds measure_time_tbb(unsigned num_threads) {
+  using namespace std::chrono;  // times one complete run of the TBB kernel
+  const auto started = high_resolution_clock::now();
+  matrix_multiplication_tbb(num_threads);
+  return duration_cast<microseconds>(high_resolution_clock::now() - started);  // whole microseconds
+}
diff --git a/docs/master-branch.html b/docs/master-branch.html
@@ -103,7 +103,8 @@ <h1><a class="anchor" id="master-branch_new_features"></a>
 Working Items</h1>
 <ul>
 <li>Improving the performance of work stealing algorithm </li>
-<li>Discovering a scalable memory allocator to handle the taskflow graph</li>
+<li>Discovering a scalable memory allocator to handle the taskflow graph </li>
+<li>Adding more benchmarks to compare Cpp-Taskflow with OpenMP and TBB</li>
 </ul>
 <h1><a class="anchor" id="master-branch_bug_fixes"></a>
 Bug Fixes</h1>
diff --git a/doxygen/releases/master-branch.dox b/doxygen/releases/master-branch.dox
@@ -17,6 +17,7 @@ To download the newest version of Cpp-Taskflow, please clone from <a href="https
 
 @li Improving the performance of work stealing algorithm
 @li Discovering a scalable memory allocator to handle the taskflow graph
+@li Adding more benchmarks to compare Cpp-Taskflow with OpenMP and TBB
 
 @section master-branch_bug_fixes Bug Fixes 
 
diff --git a/legacy/20190514/example/dataflow.hpp b/legacy/20190514/example/dataflow.hpp
diff --git a/legacy/20190514/example/dice_pools.cpp b/legacy/20190514/example/dice_pools.cpp
diff --git a/legacy/20190514/example/get_best_dice.cpp b/legacy/20190514/example/get_best_dice.cpp