Skip to content

Commit 896799f

Browse files
added matrix-multiplication benchmark
1 parent 24072c5 commit 896799f

File tree

11 files changed

+291
-1
lines changed

11 files changed

+291
-1
lines changed

CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,23 @@ target_link_libraries(
320320
)
321321
set_target_properties(parallel_dnn PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
322322

323+
## benchmark 6: matrix multiplication
324+
message(STATUS "benchmark 6: matrix multiplication")
325+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_BENCHMARK_DIR}/matrix_multiplication)
326+
add_executable(
327+
matrix_multiplication
328+
${TF_BENCHMARK_DIR}/matrix_multiplication/main.cpp
329+
${TF_BENCHMARK_DIR}/matrix_multiplication/omp.cpp
330+
${TF_BENCHMARK_DIR}/matrix_multiplication/tbb.cpp
331+
${TF_BENCHMARK_DIR}/matrix_multiplication/taskflow.cpp
332+
)
333+
target_include_directories(matrix_multiplication PRIVATE ${PROJECT_SOURCE_DIR}/3rd-party/CLI11)
334+
target_link_libraries(
335+
matrix_multiplication
336+
${PROJECT_NAME} Threads::Threads ${TBB_IMPORTED_TARGETS} ${OpenMP_CXX_LIBRARIES}
337+
)
338+
set_target_properties(matrix_multiplication PROPERTIES COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
339+
323340
endif()
324341

325342
# -----------------------------------------------------------------------------
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include "matrix_multiplication.hpp"
2+
#include <CLI11.hpp>
3+
4+
void matrix_multiplication(
5+
const std::string& model,
6+
const unsigned num_threads,
7+
const unsigned num_rounds
8+
) {
9+
10+
std::cout << std::setw(12) << "size"
11+
<< std::setw(12) << "runtime"
12+
<< std::endl;
13+
14+
for(int i=128; i<=1024; i += 32) {
15+
16+
N = i;
17+
18+
allocate_matrix();
19+
20+
double runtime {0.0};
21+
22+
for(unsigned j=0; j<num_rounds; ++j) {
23+
if(model == "tf") {
24+
runtime += measure_time_taskflow(num_threads).count();
25+
}
26+
else if(model == "tbb") {
27+
runtime += measure_time_tbb(num_threads).count();
28+
}
29+
else if(model == "omp") {
30+
runtime += measure_time_omp(num_threads).count();
31+
}
32+
else assert(false);
33+
}
34+
35+
std::cout << std::setw(12) << N
36+
<< std::setw(12) << runtime / num_rounds / 1e3
37+
<< std::endl;
38+
39+
deallocate_matrix();
40+
}
41+
}
42+
43+
int main(int argc, char* argv[]) {
44+
45+
CLI::App app{"MatrixMultiplication"};
46+
47+
unsigned num_threads {1};
48+
app.add_option("-t,--num_threads", num_threads, "number of threads (default=1)");
49+
50+
unsigned num_rounds {1};
51+
app.add_option("-r,--num_rounds", num_rounds, "number of rounds (default=1)");
52+
53+
std::string model = "tf";
54+
app.add_option("-m,--model", model, "model name tbb|omp|tf (default=tf)")
55+
->check([] (const std::string& m) {
56+
if(m != "tbb" && m != "tf" && m != "omp") {
57+
return "model name should be \"tbb\", \"omp\", or \"tf\"";
58+
}
59+
return "";
60+
});
61+
62+
CLI11_PARSE(app, argc, argv);
63+
64+
std::cout << "model=" << model << ' '
65+
<< "num_threads=" << num_threads << ' '
66+
<< "num_rounds=" << num_rounds << ' '
67+
<< std::endl;
68+
69+
matrix_multiplication(model, num_threads, num_rounds);
70+
71+
return 0;
72+
}
73+
74+
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#include <algorithm>
2+
#include <cassert>
3+
#include <cstdio>
4+
#include <chrono>
5+
#include <iostream>
6+
#include <iomanip>
7+
#include <thread>
8+
#include <random>
9+
#include <cmath>
10+
#include <atomic>
11+
12+
inline int N;
13+
inline double **a, **b, **c;
14+
15+
std::chrono::microseconds measure_time_taskflow(unsigned);
16+
std::chrono::microseconds measure_time_tbb(unsigned);
17+
std::chrono::microseconds measure_time_omp(unsigned);
18+
19+
inline void allocate_matrix() {
20+
a = static_cast<double**>(std::malloc(N * sizeof(double*)));
21+
b = static_cast<double**>(std::malloc(N * sizeof(double*)));
22+
c = static_cast<double**>(std::malloc(N * sizeof(double*)));
23+
for(int i=0; i<N; ++i) {
24+
a[i] = static_cast<double*>(std::malloc(N * sizeof(double)));
25+
b[i] = static_cast<double*>(std::malloc(N * sizeof(double)));
26+
c[i] = static_cast<double*>(std::malloc(N * sizeof(double)));
27+
}
28+
}
29+
30+
inline void deallocate_matrix() {
31+
for(int i=0; i<N; ++i) {
32+
std::free(a[i]);
33+
std::free(b[i]);
34+
std::free(c[i]);
35+
}
36+
std::free(a);
37+
std::free(b);
38+
std::free(c);
39+
}
40+
41+
inline int64_t reduce_sum() {
42+
int64_t sum {0};
43+
for(int i=0; i<N; i++) {
44+
for(int j=0; j<N; ++j) {
45+
sum += c[i][j];
46+
}
47+
}
48+
return sum;
49+
}
50+
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#include "matrix_multiplication.hpp"
2+
#include <omp.h>
3+
4+
// matrix_multiplication_omp
5+
// reference: https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c
6+
void matrix_multiplication_omp(unsigned nthreads) {
7+
8+
omp_set_num_threads(nthreads);
9+
10+
int i, j, k;
11+
12+
#pragma omp parallel shared(a, b, c, nthreads) private(i, j, k)
13+
{
14+
15+
#pragma omp for schedule (static)
16+
for (i=0; i<N; i++)
17+
for (j=0; j<N; j++)
18+
a[i][j]= i+j;
19+
20+
#pragma omp for schedule (static)
21+
for (i=0; i<N; i++)
22+
for (j=0; j<N; j++)
23+
b[i][j]= i*j;
24+
25+
#pragma omp for schedule (static)
26+
for (i=0; i<N; i++)
27+
for (j=0; j<N; j++)
28+
c[i][j]= 0;
29+
30+
#pragma omp for schedule (static)
31+
for (i=0; i<N; i++) {
32+
for(j=0; j<N; j++) {
33+
for (k=0; k<N; k++) {
34+
c[i][j] += a[i][k] * b[k][j];
35+
}
36+
}
37+
}
38+
}
39+
40+
//std::cout << reduce_sum() << std::endl;
41+
}
42+
43+
std::chrono::microseconds measure_time_omp(unsigned num_threads) {
44+
auto beg = std::chrono::high_resolution_clock::now();
45+
matrix_multiplication_omp(num_threads);
46+
auto end = std::chrono::high_resolution_clock::now();
47+
return std::chrono::duration_cast<std::chrono::microseconds>(end - beg);
48+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include "matrix_multiplication.hpp"
2+
#include <taskflow/taskflow.hpp>
3+
4+
// matrix_multiplication_taskflow
5+
void matrix_multiplication_taskflow(unsigned num_threads) {
6+
7+
tf::Executor executor(num_threads);
8+
tf::Taskflow taskflow;
9+
10+
auto pa = taskflow.parallel_for(0, N, 1, [&] (int i) {
11+
for(int j=0; j<N; ++j) {
12+
a[i][j] = i + j;
13+
}
14+
});
15+
16+
auto pb = taskflow.parallel_for(0, N, 1, [&] (int i) {
17+
for(int j=0; j<N; ++j) {
18+
b[i][j] = i * j;
19+
}
20+
});
21+
22+
auto pc = taskflow.parallel_for(0, N, 1, [&] (int i) {
23+
for(int j=0; j<N; ++j) {
24+
c[i][j] = 0;;
25+
}
26+
});
27+
28+
auto pr = taskflow.parallel_for(0, N, 1, [&] (int i) {
29+
for(int j=0; j<N; ++j) {
30+
for(int k=0; k<N; k++) {
31+
c[i][j] += a[i][k] * b[k][j];
32+
}
33+
}
34+
});
35+
36+
pa.second.precede(pr.first);
37+
pb.second.precede(pr.first);
38+
pc.second.precede(pr.first);
39+
40+
executor.run(taskflow).get();
41+
42+
//std::cout << reduce_sum() << std::endl;
43+
}
44+
45+
std::chrono::microseconds measure_time_taskflow(unsigned num_threads) {
46+
auto beg = std::chrono::high_resolution_clock::now();
47+
matrix_multiplication_taskflow(num_threads);
48+
auto end = std::chrono::high_resolution_clock::now();
49+
return std::chrono::duration_cast<std::chrono::microseconds>(end - beg);
50+
}
51+
52+
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include "matrix_multiplication.hpp"
2+
#include <tbb/tbb.h>
3+
#include <tbb/task_scheduler_init.h>
4+
5+
// matrix_multiplication_tbb
6+
void matrix_multiplication_tbb(unsigned num_threads) {
7+
8+
using namespace tbb;
9+
using namespace tbb::flow;
10+
11+
tbb::task_scheduler_init init(num_threads);
12+
13+
parallel_for(0, N, 1, [=](int i) {
14+
for(int j=0; j<N; ++j) {
15+
a[i][j] = i + j;
16+
}
17+
});
18+
19+
parallel_for(0, N, 1, [=](int i) {
20+
for(int j=0; j<N; ++j) {
21+
b[i][j] = i * j;
22+
}
23+
});
24+
25+
parallel_for(0, N, 1, [=](int i) {
26+
for(int j=0; j<N; ++j) {
27+
c[i][j] = 0;
28+
}
29+
});
30+
31+
parallel_for(0, N, 1, [=](int i) {
32+
for(int j=0; j<N; ++j) {
33+
for(int k=0; k<N; ++k) {
34+
c[i][j] += a[i][k] * b[k][j];
35+
}
36+
}
37+
});
38+
39+
//std::cout << reduce_sum() << std::endl;
40+
}
41+
42+
std::chrono::microseconds measure_time_tbb(unsigned num_threads) {
43+
auto beg = std::chrono::high_resolution_clock::now();
44+
matrix_multiplication_tbb(num_threads);
45+
auto end = std::chrono::high_resolution_clock::now();
46+
return std::chrono::duration_cast<std::chrono::microseconds>(end - beg);
47+
}

docs/master-branch.html

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ <h1><a class="anchor" id="master-branch_new_features"></a>
103103
Working Items</h1>
104104
<ul>
105105
<li>Improving the performance of work stealing algorithm </li>
106-
<li>Discovering a scalable memory allocator to handle the taskflow graph</li>
106+
<li>Discovering a scalable memory allocator to handle the taskflow graph </li>
107+
<li>Adding more benchmarks to compare Cpp-Taskflow with OpenMP and TBB</li>
107108
</ul>
108109
<h1><a class="anchor" id="master-branch_bug_fixes"></a>
109110
Bug Fixes</h1>

doxygen/releases/master-branch.dox

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ To download the newest version of Cpp-Taskflow, please clone from <a href="https
1717

1818
@li Improving the performance of work stealing algorithm
1919
@li Discovering a scalable memory allocator to handle the taskflow graph
20+
@li Adding more benchmarks to compare Cpp-Taskflow with OpenMP and TBB
2021

2122
@section master-branch_bug_fixes Bug Fixes
2223

0 commit comments

Comments
 (0)