GeorgeARM
diff --git a/‎CMakeLists.txt‎
Lines changed: 22 additions & 86 deletions b/‎CMakeLists.txt‎
Lines changed: 22 additions & 86 deletions
diff --git a/‎benchmarks/hetero_traversal/graph.hpp‎
Lines changed: 98 additions & 0 deletions b/‎benchmarks/hetero_traversal/graph.hpp‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎benchmarks/hetero_traversal/main.cu‎
Lines changed: 78 additions & 0 deletions b/‎benchmarks/hetero_traversal/main.cu‎
Lines changed: 78 additions & 0 deletions
@@ -751,93 +751,29 @@ target_link_libraries(
 )
 set_target_properties(matrix_multiplication PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
 
-## benchmark 6: Parallel DNN
-#message(STATUS "benchmark 6: Parallel DNN")
-#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/parallel_dnn)
-#add_executable(
-#  parallel_dnn 
-#  ${TF_BENCHMARK_DIR}/parallel_dnn/main.cpp
-#  ${TF_BENCHMARK_DIR}/parallel_dnn/omp.cpp
-#  ${TF_BENCHMARK_DIR}/parallel_dnn/tbb.cpp
-#  ${TF_BENCHMARK_DIR}/parallel_dnn/seq.cpp
-#  ${TF_BENCHMARK_DIR}/parallel_dnn/taskflow.cpp
-#)
-#target_include_directories(parallel_dnn PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
-#target_link_libraries(
-#  parallel_dnn 
-#  ${PROJECT_NAME} 
-#  Threads::Threads 
-#  ${TBB_IMPORTED_TARGETS}
-#  ${OpenMP_CXX_LIBRARIES} 
-#  stdc++fs 
-#  tf::default_settings
-#)
-#set_target_properties(parallel_dnn PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
-#
 
-
-### benchmark 8: Mandelbrot set
-#message(STATUS "benchmark 8: Mandelbrot set")
-#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/mandelbrot_set)
-#add_executable(
-#  mandelbrot_set
-#  ${TF_BENCHMARK_DIR}/mandelbrot_set/main.cpp
-#  ${TF_BENCHMARK_DIR}/mandelbrot_set/omp.cpp
-#  ${TF_BENCHMARK_DIR}/mandelbrot_set/tbb.cpp
-#  ${TF_BENCHMARK_DIR}/mandelbrot_set/taskflow.cpp
-#)
-#target_include_directories(mandelbrot_set PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
-#target_link_libraries(
-#  mandelbrot_set
-#  ${PROJECT_NAME} 
-#  Threads::Threads 
-#  ${TBB_IMPORTED_TARGETS} 
-#  ${OpenMP_CXX_LIBRARIES} 
-#  tf::default_settings
-#)
-#set_target_properties(mandelbrot_set PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
-#
-### benchmark 9: Black–Scholes
-#message(STATUS "benchmark 9: Black-Scholes Partial Differential Equation")
-#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/black_scholes)
-#add_executable(
-#  black_scholes
-#  ${TF_BENCHMARK_DIR}/black_scholes/main.cpp
-#  ${TF_BENCHMARK_DIR}/black_scholes/omp.cpp
-#  ${TF_BENCHMARK_DIR}/black_scholes/tbb.cpp
-#  ${TF_BENCHMARK_DIR}/black_scholes/taskflow.cpp
-#)
-#target_include_directories(black_scholes PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
-#target_link_libraries(
-#  black_scholes
-#  ${PROJECT_NAME} 
-#  Threads::Threads 
-#  ${TBB_IMPORTED_TARGETS} 
-#  ${OpenMP_CXX_LIBRARIES} 
-#  tf::default_settings
-#)
-#set_target_properties(black_scholes PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
-#
-### benchmark 10: Strassen algorithm
-#message(STATUS "benchmark 10: Strassen matrix multiplication algorithm")
-#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/strassen)
-#add_executable(
-#  strassen
-#  ${TF_BENCHMARK_DIR}/strassen/main.cpp
-#  ${TF_BENCHMARK_DIR}/strassen/omp.cpp
-#  ${TF_BENCHMARK_DIR}/strassen/tbb.cpp
-#  ${TF_BENCHMARK_DIR}/strassen/taskflow.cpp
-#)
-#target_include_directories(strassen PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
-#target_link_libraries(
-#  strassen
-#  ${PROJECT_NAME} 
-#  Threads::Threads 
-#  ${TBB_IMPORTED_TARGETS} 
-#  ${OpenMP_CXX_LIBRARIES} 
-#  tf::default_settings
-#)
-#set_target_properties(strassen PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
+# Everything up to the matching endif() is configured only when
+# TF_ENABLE_CUDA evaluates to true.
+if(${TF_ENABLE_CUDA})
+## cuda benchmark 1: heterogeneous traversal
+message(STATUS "cuda benchmark 1: heterogeneous traversal")
+# Emit the binary into the benchmark's own directory.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/hetero_traversal)
+# One driver (main.cu) plus one translation unit per programming model.
+add_executable(
+  hetero_traversal 
+  ${TF_BENCHMARK_DIR}/hetero_traversal/main.cu
+  ${TF_BENCHMARK_DIR}/hetero_traversal/taskflow.cu
+  ${TF_BENCHMARK_DIR}/hetero_traversal/tbb.cu
+  ${TF_BENCHMARK_DIR}/hetero_traversal/omp.cu
+)
+# CLI11 supplies the command-line parser header included by main.cu.
+target_include_directories(hetero_traversal PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
+target_link_libraries(
+  hetero_traversal 
+  ${PROJECT_NAME} 
+  Threads::Threads 
+  ${TBB_IMPORTED_TARGETS} 
+  ${OpenMP_CXX_LIBRARIES} 
+  tf::default_settings
+)
+# The sources are .cu files, so the OpenMP flags are passed through
+# -Xcompiler (NOTE(review): assumes nvcc is the CUDA compiler -- confirm).
+set_target_properties(hetero_traversal PROPERTIES COMPILE_FLAGS "-Xcompiler ${OpenMP_CXX_FLAGS}")
+endif(${TF_ENABLE_CUDA})
 
 endif()
 
 
@@ -0,0 +1,98 @@
+#include <taskflow/taskflow.hpp>
+
+/**
+ * @brief Hash functor for std::pair, usable as the hasher of unordered
+ *        containers keyed by a pair.
+ *
+ * The two member hashes are mixed asymmetrically (boost::hash_combine style)
+ * instead of being XOR-ed, so (a, b) and (b, a) hash differently and (a, a)
+ * no longer always collapses to zero.
+ *
+ * @param pair the pair to hash; both member types need a std::hash specialization
+ * @return the combined hash value
+ */
+struct pair_hash {
+  template <typename T1, typename T2>
+  size_t operator ()(const std::pair<T1, T2>& pair) const {
+    const size_t h1 = std::hash<T1>()(pair.first);
+    const size_t h2 = std::hash<T2>()(pair.second);
+    // 0x9e3779b9 is the golden-ratio constant used by boost::hash_combine;
+    // the shifts of h1 make the result depend on the order of the members.
+    return h1 ^ (h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2));
+  }
+};
+
+/**
+ * @brief Randomly generated graph whose nodes optionally carry a GPU index.
+ *
+ * Nodes are numbered 0..num_nodes-1. Every stored edge (u, v) satisfies
+ * u < v and appears at most once. All randomness comes from rand(); no
+ * seeding is done here.
+ */
+struct Graph {
+
+  struct Node {
+    int v;  // node index
+    int g;  // GPU index in [0, num_gpus), or -1 when no GPU is assigned
+  };
+
+  struct Edge {
+    int u, v;  // endpoints, always u < v
+  };
+
+  int num_nodes;
+  int num_edges;  // clamped by the constructor to what a simple graph can hold
+  int num_gpus;   // value reported by tf::cuda_num_devices()
+
+  std::vector<Edge> edges;
+  std::vector<Node> nodes;
+
+  /**
+   * @brief Builds the random node and edge sets.
+   *
+   * @param V          requested number of nodes; negative values are treated as 0
+   * @param E          requested number of edges; clamped to [0, V*(V-1)/2]
+   * @param cuda_ratio a node gets a GPU index when rand()%cuda_ratio == 0;
+   *                   0 is treated as 1 to avoid a modulo by zero
+   */
+  Graph(int V, int E, int cuda_ratio) :
+    num_nodes {std::max(V, 0)},
+    num_edges {std::max(E, 0)},
+    num_gpus  {static_cast<int>(tf::cuda_num_devices())} {
+
+    // Computed in 64 bits: num_nodes*(num_nodes-1) overflows int for large
+    // node counts.
+    const long long max_edges =
+      static_cast<long long>(num_nodes) * (num_nodes - 1) / 2;
+    if(max_edges < num_edges) {
+      num_edges = static_cast<int>(max_edges);
+    }
+
+    // rand()%0 is undefined; fall back to a ratio of 1.
+    const int ratio = (cuda_ratio == 0) ? 1 : cuda_ratio;
+
+    nodes.reserve(num_nodes);
+    for(int j=0; j<num_nodes; j++) {
+      Node v;
+      v.v = j;
+      // rand() is drawn first so the random sequence matches the unguarded
+      // form; when no device is reported there is nothing to pick from and
+      // the node stays unassigned.
+      v.g = (rand()%ratio == 0 && num_gpus > 0) ? rand()%num_gpus : -1;
+      nodes.push_back(v);
+    }
+
+    std::unordered_set<std::pair<int, int>, pair_hash> set;
+
+    // num_edges > 0 implies num_nodes >= 2 here, so rand()%num_nodes is safe
+    // and enough distinct pairs exist for the redraw loop to terminate.
+    for (int j=0; j<num_edges; j++) {
+
+      std::pair<int, int> p;
+      p.first = rand() % num_nodes;
+      p.second = rand() % num_nodes;
+
+      // Redraw until p is an unseen pair with first < second.
+      while(set.find(p) != set.end() || p.first >= p.second) {
+        p.first = rand() % num_nodes;
+        p.second = rand() % num_nodes;
+        if(p.first >= p.second) {
+          std::swap(p.first, p.second);
+        }
+      }
+
+      set.insert(p);
+    }
+
+    edges.reserve(set.size());
+    for (const auto& p : set) {
+      edges.push_back(Edge{p.first, p.second});
+    }
+  }
+
+  /**
+   * @brief Writes the graph as text: a "num_nodes num_edges" line, then one
+   *        GPU index per node, then one "u v" line per edge.
+   */
+  void dump(std::ostream& os) const {
+    os << num_nodes << ' ' << num_edges << '\n';
+    for(const auto& v : nodes) {
+      os << v.g << '\n';
+    }
+    for(const auto& e : edges) {
+      os << e.u << ' ' << e.v << '\n';
+    }
+  }
+
+  /** @return number of nodes plus number of edges. */
+  size_t size() const {
+    return nodes.size() + edges.size();
+  }
+};
+
+// Element-wise integer addition kernel: z[i] = x[i] + y[i] for 0 <= i < n.
+// Each thread handles the single element at its global index; threads whose
+// index is n or beyond do nothing.
+inline __global__ void add(int* x, int* y, int* z, int n) {
+  const int idx = threadIdx.x + blockDim.x*blockIdx.x;
+  if (idx >= n) {
+    return;
+  }
+  z[idx] = x[idx] + y[idx];
+}
+
+// Timing entry points, one per programming model. They are only declared
+// here; the definitions presumably live in the per-model sources
+// (taskflow.cu, tbb.cu, omp.cu). Each takes the graph plus two unsigned
+// settings -- main() passes the thread count and the GPU count, in that
+// order -- and returns a duration in microseconds.
+std::chrono::microseconds measure_time_taskflow(const Graph&, unsigned, unsigned);
+std::chrono::microseconds measure_time_tbb(const Graph&, unsigned, unsigned);
+std::chrono::microseconds measure_time_omp(const Graph&, unsigned, unsigned);
+
+
@@ -0,0 +1,78 @@
+#include "graph.hpp"
+#include <CLI11.hpp>
+
+/**
+ * Benchmark driver for the heterogeneous traversal workload.
+ *
+ * Parses the command line, then for node counts 10, 510, ..., 9510 builds a
+ * random Graph requesting four times as many edges as nodes and times the
+ * selected model (tf | tbb | omp) over num_rounds runs. One table row is
+ * printed per graph: |V|+|E| and the mean runtime per round in milliseconds.
+ *
+ * Returns 0 on success and 1 when num_rounds or cuda_ratio is zero;
+ * CLI11_PARSE may also return early (e.g. on a parse error).
+ */
+int main(int argc, char* argv[]) {
+
+  CLI::App app{"HeteroTraversal"};
+
+  unsigned num_threads {1};
+  app.add_option("-t,--num_threads", num_threads, "number of threads (default=1)");
+
+  unsigned num_gpus {1};
+  app.add_option("-g,--num_gpus", num_gpus, "number of gpus (default=1)");
+
+  unsigned num_rounds {5};
+  app.add_option("-r,--num_rounds", num_rounds, "number of rounds (default=5)");
+
+  unsigned cuda_ratio {2};
+  app.add_option(
+    "-c,--cuda_ratio",
+    cuda_ratio,
+    "cpu/cuda task ratio (the higher, the fewer cuda tasks) (default=2)"
+  );
+
+  std::string model = "tf";
+  app.add_option("-m,--model", model, "model name tf|tbb|omp (default=tf)")
+     ->check([] (const std::string& m) {
+        if(m != "tf" && m != "tbb" && m != "omp") {
+          return "model name should be \"tbb\", \"tf\", or \"omp\"";
+        }
+        return "";
+     });
+
+  CLI11_PARSE(app, argc, argv);
+
+  // Zero rounds would turn the average below into 0/0, and Graph uses the
+  // ratio as a modulo divisor; reject both before doing any work.
+  if(num_rounds == 0 || cuda_ratio == 0) {
+    std::cerr << "num_rounds and cuda_ratio must be greater than zero\n";
+    return 1;
+  }
+
+  // The validator above restricts model to tf|tbb|omp, so the timing routine
+  // can be picked once instead of comparing strings in the inner loop.
+  auto measure = measure_time_taskflow;
+  if(model == "tbb") {
+    measure = measure_time_tbb;
+  }
+  else if(model == "omp") {
+    measure = measure_time_omp;
+  }
+
+  std::cout << "model=" << model << ' '
+            << "num_threads=" << num_threads << ' '
+            << "num_gpus=" << num_gpus << ' '
+            << "num_rounds=" << num_rounds << ' '
+            << std::endl;
+
+  std::cout << std::setw(12) << "|V|+|E|"
+            << std::setw(12) << "Runtime"
+            << '\n';
+
+  // NOTE(review): presumably meant to start the measurements from a clean
+  // device state; the return code is not checked.
+  cudaDeviceReset();
+
+  for(int i=10; i<=10000; i += 500) {
+
+    // Accumulated in microseconds, as returned by the measure_time_* routines.
+    double runtime {0.0};
+
+    Graph graph(i, 4*i, static_cast<int>(cuda_ratio));
+
+    //std::ofstream ofs(std::string("graph") + std::to_string(graph.size()) + ".txt");
+    //graph.dump(ofs);
+    //continue;
+
+    for(unsigned j=0; j<num_rounds; ++j) {
+      runtime += measure(graph, num_threads, num_gpus).count();
+    }
+
+    // Mean over the rounds, converted from microseconds to milliseconds.
+    std::cout << std::setw(12) << graph.size()
+              << std::setw(12) << runtime / num_rounds / 1e3
+              << std::endl;
+  }
+
+  return 0;
+}
+
+
+