|
| 1 | +# Wavefront |
| 2 | + |
| 3 | +This page we compare the three implementations of wavefront computing pattern using OpenMP, Intel-TBB and Cpp-Taskflow. |
| 4 | +The wavefront computing pattern is from the blog in [Intel Developer Zone]. |
| 5 | + |
| 6 | + |
| 7 | + |
| 8 | +As shown in the figure, we partition a 2D matrix into a set of identical square sub-matrices (blocks). |
| 9 | +Each submatrix is mapped to a task that performs a linear scan through each element and |
| 10 | +apply some arithmetic calculation. The wavefront propagates the block dependency diagonally |
| 11 | +from the top-left submatrix to the bottom-right submatrix. Each block precedes two blocks, one to the |
| 12 | +right and another below. The blocks with the same color can run concurrently. |
| 13 | + |
| 14 | + |
| 15 | ++ [OpenMp](#openmp) |
| 16 | ++ [Intel-TBB](#intel-tbb) |
| 17 | ++ [Cpp-Taskflow](#cpp-taskflow) |
| 18 | + |
| 19 | + |
| 20 | +# OpenMP |
| 21 | + |
| 22 | +```cpp |
| 23 | +1: // MB, NB: number of blocks in the two dimensions. B: dimension of a block |
| 24 | +2: // matrix: the given 2D matrix |
| 25 | +3: // D: dependency matrix |
| 26 | +4: void wavefront(size_t MB, size_t NB, size_t B, double** matrix, int** D){ |
| 27 | +5: omp_set_num_threads(std::thread::hardware_concurrency()); |
| 28 | +6: #pragma omp parallel |
| 29 | +7: { |
| 30 | +8: #pragma omp single |
| 31 | +9: { |
| 32 | +10: for(int i=0; i<MB; i++){ |
| 33 | +11: for(int j=0; j<NB; j++) { |
| 34 | +12: if(i > 0 && j > 0){ |
| 35 | +13: #pragma omp task depend(in:D[i-1][j], D[i][j-1]) depend(out:D[i][j]) firstprivate(i, j) |
| 36 | +14: block_computation(matrix, B, i, j); |
| 37 | +15: } |
| 38 | +16: // Top left corner |
| 39 | +17: else if(i == 0 && j == 0){ |
| 40 | +18: #pragma omp task depend(out:D[i][j]) firstprivate(i, j) |
| 41 | +19: block_computation(matrix, B, i, j); |
| 42 | +20: } |
| 43 | +21: // Top edge |
| 44 | +22: else if(j+1 <= NB && i == 0 && j > 0){ |
| 45 | +23: #pragma omp task depend(in:D[i][j-1]) depend(out:D[i][j]) firstprivate(i, j) |
| 46 | +24: block_computation(matrix, B, i, j); |
| 47 | +25: } |
| 48 | +26: // Left edge |
| 49 | +27: else if(i+1 <= MB && i > 0 && j == 0){ |
| 50 | +28: #pragma omp task depend(in:D[i-1][j]) depend(out:D[i][j]) firstprivate(i, j) |
| 51 | +29: block_computation(matrix, B, i, j); |
| 52 | +30: } |
| 53 | +31: // Bottom right corner |
| 54 | +32: else{ |
| 55 | +33: #pragma omp task depend(in:D[i-1][j] ,D[i][j-1]) firstprivate(i, j) |
| 56 | +34: block_computation(matrix, B, i, j); |
| 57 | +35: } |
| 58 | +36: } // End of inner loop |
| 59 | +37: } // End of outer loop |
| 60 | +38: } // End of omp single |
| 61 | +39: } // End of omp parallel |
| 62 | +40: } |
| 63 | +``` |
| 64 | +
|
| 65 | +This function shows the wavefront computing implemented using OpenMP. Each |
| 66 | +block is delegated to a OpenMP task. For each task we need to explicitly specify both the |
| 67 | +input and output depedency and an additional depedency matrix `D` is |
| 68 | +created for this purpose. |
| 69 | +
|
| 70 | +
|
| 71 | +# Intel-TBB |
| 72 | +
|
| 73 | +```cpp |
| 74 | +1: using namespace tbb; |
| 75 | +2: using namespace tbb::flow; |
| 76 | +3: |
| 77 | +4: // MB, NB: number of blocks in the two dimensions. B: dimension of a block |
| 78 | +5: // matrix: the given 2D matrix |
| 79 | +6: // nodes: the nodes in flow graph |
| 80 | +7: // G: Intel-TBB flow graph |
| 81 | +8: void wavefront(size_t MB, size_t NB, size_t B, double** matrix, continue_node<continue_msg> ***nodes, Graph& G){ |
| 82 | +9: for(int i=MB; --i>=0;) { |
| 83 | +10: for(int j=NB; --j>=0;) { |
| 84 | +11: node[i][j] = new continue_node<continue_msg>(G, |
| 85 | +12: [=](const continue_msg&) { |
| 86 | +13: block_computation(matrix, i, j); |
| 87 | +14: }); |
| 88 | +15: if(i+1 < MB) { |
| 89 | +16: make_edge(*node[i][j], *node[i+1][j]); |
| 90 | +17: } |
| 91 | +18: if(j+1 < NB) { |
| 92 | +19: make_edge(*node[i][j], *node[i][j+1]); |
| 93 | +20: } |
| 94 | +21: } // End of inner loop |
| 95 | +22: } // End of outer loop |
| 96 | +23: |
| 97 | +24: nodes[0][0]->try_put(continue_msg()); |
| 98 | +25: G.wait_for_all(); |
| 99 | +26: } |
| 100 | +``` |
| 101 | + |
| 102 | +This function shows the wavefront computing implemented using Intel-TBB flow graph. We |
| 103 | +build a depedency graph using the `continue_node` type in TBB flow graph and delegate |
| 104 | +each block to a node. The `make_edge` function specifies the depedency between two nodes |
| 105 | +and calling `wait_for_all` waits until all computations complete. |
| 106 | + |
| 107 | +# Cpp-Taskflow |
| 108 | + |
| 109 | +```cpp |
| 110 | +1: // MB, NB: number of blocks in the two dimensions. B: dimension of a block |
| 111 | +2: // matrix: the given 2D matrix |
| 112 | +3: // tasks: the placeholders for tasks in Taskflow |
| 113 | +4: // tf: Taskflow object |
| 114 | +5: void wavefront(size_t MB, size_t NB, size_t B, double** matrix, std::vector<std::vector<tf::Task>>& tasks, tf::Taskflow& tf){ |
| 115 | +6: for(int i=MB; --i>=0;) { |
| 116 | +7: for(int j=NB; --j>=0;) { |
| 117 | +8: task[i][j].work([=]() { |
| 118 | +9: block_computation(matrix, B, i, j); |
| 119 | +10: }); |
| 120 | +11: if(j+1 < NB) { |
| 121 | +12: task[i][j].precede(task[i][j+1]); |
| 122 | +13: } |
| 123 | +14: if(i+1 < MB) { |
| 124 | +15: task[i][j].precede(task[i+1][j]); |
| 125 | +16: } |
| 126 | +17: } // End of inner loop |
| 127 | +18: } // End of outer loop |
| 128 | +19: |
| 129 | +20: tf.wait_for_all(); |
| 130 | +21: } |
| 131 | +``` |
| 132 | +This function shows the wavefront computing implemented using Cpp-Taskflow. We |
| 133 | +delegate each block to a `tf::Task` and use the `precede` function to specify |
| 134 | +the dependency between tasks. The `tf.wait_for_all()` blocks until all tasks |
| 135 | +are executed. |
| 136 | +
|
| 137 | +
|
| 138 | +* * * |
| 139 | +
|
| 140 | +[GraphvizOnline]: https://dreampuf.github.io/GraphvizOnline/ |
| 141 | +[Intel Developer Zone]: https://software.intel.com/en-us/blogs/2011/09/09/implementing-a-wave-front-computation-using-the-intel-threading-building-blocks-flow-graph |
0 commit comments