Skip to content

Commit 5fb9ddc

Browse files
committed
added per-thread stream
1 parent 5eadf10 commit 5fb9ddc

10 files changed

Lines changed: 2569 additions & 119 deletions

File tree

CMakeLists.txt

Lines changed: 22 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -751,93 +751,29 @@ target_link_libraries(
751751
)
752752
set_target_properties(matrix_multiplication PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
753753

754-
## benchmark 6: Parallel DNN
755-
#message(STATUS "benchmark 6: Parallel DNN")
756-
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/parallel_dnn)
757-
#add_executable(
758-
# parallel_dnn
759-
# ${TF_BENCHMARK_DIR}/parallel_dnn/main.cpp
760-
# ${TF_BENCHMARK_DIR}/parallel_dnn/omp.cpp
761-
# ${TF_BENCHMARK_DIR}/parallel_dnn/tbb.cpp
762-
# ${TF_BENCHMARK_DIR}/parallel_dnn/seq.cpp
763-
# ${TF_BENCHMARK_DIR}/parallel_dnn/taskflow.cpp
764-
#)
765-
#target_include_directories(parallel_dnn PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
766-
#target_link_libraries(
767-
# parallel_dnn
768-
# ${PROJECT_NAME}
769-
# Threads::Threads
770-
# ${TBB_IMPORTED_TARGETS}
771-
# ${OpenMP_CXX_LIBRARIES}
772-
# stdc++fs
773-
# tf::default_settings
774-
#)
775-
#set_target_properties(parallel_dnn PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
776-
#
777754

778-
779-
### benchmark 8: Mandelbrot set
780-
#message(STATUS "benchmark 8: Mandelbrot set")
781-
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/mandelbrot_set)
782-
#add_executable(
783-
# mandelbrot_set
784-
# ${TF_BENCHMARK_DIR}/mandelbrot_set/main.cpp
785-
# ${TF_BENCHMARK_DIR}/mandelbrot_set/omp.cpp
786-
# ${TF_BENCHMARK_DIR}/mandelbrot_set/tbb.cpp
787-
# ${TF_BENCHMARK_DIR}/mandelbrot_set/taskflow.cpp
788-
#)
789-
#target_include_directories(mandelbrot_set PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
790-
#target_link_libraries(
791-
# mandelbrot_set
792-
# ${PROJECT_NAME}
793-
# Threads::Threads
794-
# ${TBB_IMPORTED_TARGETS}
795-
# ${OpenMP_CXX_LIBRARIES}
796-
# tf::default_settings
797-
#)
798-
#set_target_properties(mandelbrot_set PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
799-
#
800-
### benchmark 9: Black–Scholes
801-
#message(STATUS "benchmark 9: Black-Scholes Partial Differential Equation")
802-
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/black_scholes)
803-
#add_executable(
804-
# black_scholes
805-
# ${TF_BENCHMARK_DIR}/black_scholes/main.cpp
806-
# ${TF_BENCHMARK_DIR}/black_scholes/omp.cpp
807-
# ${TF_BENCHMARK_DIR}/black_scholes/tbb.cpp
808-
# ${TF_BENCHMARK_DIR}/black_scholes/taskflow.cpp
809-
#)
810-
#target_include_directories(black_scholes PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
811-
#target_link_libraries(
812-
# black_scholes
813-
# ${PROJECT_NAME}
814-
# Threads::Threads
815-
# ${TBB_IMPORTED_TARGETS}
816-
# ${OpenMP_CXX_LIBRARIES}
817-
# tf::default_settings
818-
#)
819-
#set_target_properties(black_scholes PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
820-
#
821-
### benchmark 10: Strassen algorithm
822-
#message(STATUS "benchmark 10: Strassen matrix multiplication algorithm")
823-
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/strassen)
824-
#add_executable(
825-
# strassen
826-
# ${TF_BENCHMARK_DIR}/strassen/main.cpp
827-
# ${TF_BENCHMARK_DIR}/strassen/omp.cpp
828-
# ${TF_BENCHMARK_DIR}/strassen/tbb.cpp
829-
# ${TF_BENCHMARK_DIR}/strassen/taskflow.cpp
830-
#)
831-
#target_include_directories(strassen PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
832-
#target_link_libraries(
833-
# strassen
834-
# ${PROJECT_NAME}
835-
# Threads::Threads
836-
# ${TBB_IMPORTED_TARGETS}
837-
# ${OpenMP_CXX_LIBRARIES}
838-
# tf::default_settings
839-
#)
840-
#set_target_properties(strassen PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
755+
# CUDA-only benchmarks: configured only when TF_ENABLE_CUDA is true.
# The variable is named directly (no ${}) so if() evaluates the variable itself
# instead of re-interpreting its expanded value.
if(TF_ENABLE_CUDA)

  ## cuda benchmark 1: heterogeneous traversal
  message(STATUS "cuda benchmark 1: heterogeneous traversal")

  # The executable is emitted next to its sources.
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/hetero_traversal)

  # One translation unit per programming model plus the driver.
  add_executable(
    hetero_traversal
    ${TF_BENCHMARK_DIR}/hetero_traversal/main.cu
    ${TF_BENCHMARK_DIR}/hetero_traversal/taskflow.cu
    ${TF_BENCHMARK_DIR}/hetero_traversal/tbb.cu
    ${TF_BENCHMARK_DIR}/hetero_traversal/omp.cu
  )

  # CLI11 provides the command-line parsing used by main.cu.
  target_include_directories(hetero_traversal PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)

  target_link_libraries(
    hetero_traversal
    ${PROJECT_NAME}
    Threads::Threads
    ${TBB_IMPORTED_TARGETS}
    ${OpenMP_CXX_LIBRARIES}
    tf::default_settings
  )

  # The sources are .cu files, so the OpenMP flag is forwarded to the host
  # compiler through -Xcompiler.
  # NOTE(review): -Xcompiler only covers the option that follows it; confirm
  # OpenMP_CXX_FLAGS is a single flag on the supported toolchains.
  set_target_properties(hetero_traversal PROPERTIES COMPILE_FLAGS "-Xcompiler ${OpenMP_CXX_FLAGS}")

endif()
841777

842778
endif()
843779

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#include <taskflow/taskflow.hpp>
2+
3+
// Hash functor for std::pair, so pairs can be used as keys of unordered
// containers.
//
// The two member hashes are mixed asymmetrically (hash_combine style) rather
// than XOR-ed: a plain XOR gives (a, b) and (b, a) the same value and sends
// every (a, a) pair to 0, which clusters keys built from small integers into
// very few buckets.
struct pair_hash {
  template <typename T1, typename T2>
  size_t operator ()(const std::pair<T1, T2>& pair) const {
    size_t seed = std::hash<T1>()(pair.first);
    const size_t h2 = std::hash<T2>()(pair.second);
    // Golden-ratio constant plus shifted copies of the running seed make the
    // result depend on the order of the two members.
    seed ^= h2 + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
  }
};
11+
12+
struct Graph {
13+
14+
struct Node {
15+
int v, g;
16+
};
17+
18+
struct Edge {
19+
int u, v;
20+
};
21+
22+
int num_nodes;
23+
int num_edges;
24+
int num_gpus;
25+
26+
std::vector<Edge> edges;
27+
std::vector<Node> nodes;
28+
29+
Graph(int V, int E, int cuda_ratio) :
30+
num_nodes {V},
31+
num_edges {E},
32+
num_gpus {static_cast<int>(tf::cuda_num_devices())} {
33+
34+
std::unordered_set<std::pair<int, int>, pair_hash> set;
35+
36+
num_edges = std::min(num_edges, (num_nodes)*(num_nodes-1)/2);
37+
38+
for(int j=0; j<num_nodes; j++) {
39+
Node v;
40+
v.v = j;
41+
v.g = rand()%cuda_ratio == 0 ? rand()%num_gpus : -1;
42+
nodes.push_back(v);
43+
}
44+
45+
for (int j=0; j<num_edges; j++) {
46+
47+
std::pair<int, int> p;
48+
p.first = rand() % num_nodes;
49+
p.second = rand() % num_nodes;
50+
51+
while(set.find(p) != set.end() || p.first >= p.second) {
52+
p.first = rand() % num_nodes;
53+
p.second = rand() % num_nodes;
54+
if(p.first >= p.second) {
55+
std::swap(p.first, p.second);
56+
}
57+
};
58+
59+
set.insert(p);
60+
}
61+
62+
for (auto& pair : set) {
63+
Edge e;
64+
e.u = pair.first;
65+
e.v = pair.second;
66+
edges.push_back(e);
67+
}
68+
set.clear();
69+
}
70+
71+
void dump(std::ostream& os) {
72+
os << num_nodes << ' ' << num_edges << '\n';
73+
for(const auto& v : nodes) {
74+
os << v.g << '\n';
75+
}
76+
for(const auto& e : edges) {
77+
os << e.u << ' ' << e.v << '\n';
78+
}
79+
}
80+
81+
size_t size() const {
82+
return nodes.size() + edges.size();
83+
}
84+
};
85+
86+
// saxpy kernel
87+
inline __global__ void add(int* x, int* y, int* z, int n) {
88+
int i = blockIdx.x*blockDim.x + threadIdx.x;
89+
if (i < n) {
90+
z[i] = x[i] + y[i];
91+
}
92+
}
93+
94+
// Benchmark entry points, one per programming model (Taskflow, TBB, OpenMP).
// main() calls each as (graph, num_threads, num_gpus) and accumulates the
// returned duration as the runtime of one round.
// NOTE(review): presumably defined in taskflow.cu / tbb.cu / omp.cu - confirm.
std::chrono::microseconds measure_time_taskflow(
  const Graph& graph, unsigned num_threads, unsigned num_gpus
);

std::chrono::microseconds measure_time_tbb(
  const Graph& graph, unsigned num_threads, unsigned num_gpus
);

std::chrono::microseconds measure_time_omp(
  const Graph& graph, unsigned num_threads, unsigned num_gpus
);
97+
98+
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#include "graph.hpp"
2+
#include <CLI11.hpp>
3+
4+
int main(int argc, char* argv[]) {
5+
6+
CLI::App app{"HeteroTraversal"};
7+
8+
unsigned num_threads {1};
9+
app.add_option("-t,--num_threads", num_threads, "number of threads (default=1)");
10+
11+
unsigned num_gpus {1};
12+
app.add_option("-g,--num_gpus", num_gpus, "number of gpus (default=1)");
13+
14+
unsigned num_rounds {5};
15+
app.add_option("-r,--num_rounds", num_rounds, "number of rounds (default=5)");
16+
17+
unsigned cuda_ratio {2};
18+
app.add_option(
19+
"-c,--cuda_ratio",
20+
cuda_ratio,
21+
"cpu/cuda task ratio (the higher, the fewer cuda tasks (default=2)"
22+
);
23+
24+
std::string model = "tf";
25+
app.add_option("-m,--model", model, "model name tf|tbb|omp (default=tf)")
26+
->check([] (const std::string& m) {
27+
if(m != "tf" && m != "tbb" && m != "omp") {
28+
return "model name should be \"tbb\", \"tf\", or \"omp\"";
29+
}
30+
return "";
31+
});
32+
33+
CLI11_PARSE(app, argc, argv);
34+
35+
std::cout << "model=" << model << ' '
36+
<< "num_threads=" << num_threads << ' '
37+
<< "num_gpus=" << num_gpus << ' '
38+
<< "num_rounds=" << num_rounds << ' '
39+
<< std::endl;
40+
41+
std::cout << std::setw(12) << "|V|+|E|"
42+
<< std::setw(12) << "Runtime"
43+
<< '\n';
44+
45+
cudaDeviceReset();
46+
47+
for(int i=10; i<=10000; i += 500) {
48+
49+
double runtime {0.0};
50+
51+
Graph graph(i, 4*i, cuda_ratio);
52+
53+
//std::ofstream ofs(std::string("graph") + std::to_string(graph.size()) + ".txt");
54+
//graph.dump(ofs);
55+
//continue;
56+
57+
for(unsigned j=0; j<num_rounds; ++j) {
58+
if(model == "tf") {
59+
runtime += measure_time_taskflow(graph, num_threads, num_gpus).count();
60+
}
61+
else if(model == "tbb") {
62+
runtime += measure_time_tbb(graph, num_threads, num_gpus).count();
63+
}
64+
else if(model == "omp") {
65+
runtime += measure_time_omp(graph, num_threads, num_gpus).count();
66+
}
67+
}
68+
69+
std::cout << std::setw(12) << graph.size()
70+
<< std::setw(12) << runtime / num_rounds / 1e3
71+
<< std::endl;
72+
}
73+
74+
return 0;
75+
}
76+
77+
78+

0 commit comments

Comments
 (0)