Skip to content

Commit f7598c4

Browse files
committed
updated saxpy.cu
1 parent fcfbe93 commit f7598c4

1 file changed

Lines changed: 83 additions & 0 deletions

File tree

examples/cuda/saxpy.cu

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#include <taskflow/taskflow.hpp>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
2+
3+
// Kernel: saxpy
4+
// Single-precision a*x + y, written back into y.
//
// Each thread derives one flat element index from the x components of
// blockIdx, blockDim and threadIdx, so only the x dimension of the launch
// configuration is used. Threads whose index falls at or beyond n do nothing,
// which lets the grid be rounded up past n.
//
//   n : number of elements to process
//   a : scalar multiplier applied to x
//   x : input array, read only by this kernel
//   y : input/output array, updated in place
__global__ void saxpy(int n, float a, float *x, float *y) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Tail guard: surplus threads in the last block exit early.
  if (idx >= n) {
    return;
  }

  y[idx] = a * x[idx] + y[idx];
}
10+
11+
// Function: main
12+
// Builds and runs a task graph that performs saxpy on the GPU:
//
//   allocate_x --+                          +--> deallocate_x
//                +--> saxpy --> verify -----+
//   allocate_y --+                          +--> deallocate_y
//
// The "saxpy" task is a cudaFlow containing host-to-device copies of x and y,
// the saxpy kernel, and device-to-host copies of both arrays. After the run
// the task graph is dumped to standard output.
//
// Returns 0 after the graph has run; a failing CUDA allocation or
// deallocation is reported on standard error and aborts the process.
int main() {

  const unsigned N = 1<<20;

  tf::Taskflow taskflow ("saxpy-flow");
  tf::Executor executor;

  std::vector<float> hx, hy;

  float* dx {nullptr};
  float* dy {nullptr};

  // Reports a failed CUDA runtime call and stops the program. Continuing
  // after a failed allocation would only surface later as an unrelated
  // copy or kernel error.
  auto check_cuda = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      std::cerr << "CUDA error in " << what << ": "
                << cudaGetErrorString(status) << '\n';
      std::abort();
    }
  };

  // allocate x: host vector filled with 1.0f plus a device buffer of N floats
  auto allocate_x = taskflow.emplace([&]() {
    std::cout << "allocating host x and device x ...\n";
    hx.resize(N, 1.0f);
    check_cuda(cudaMalloc(&dx, N*sizeof(float)), "cudaMalloc(dx)");
  }).name("allocate_x");

  // allocate y: host vector filled with 2.0f plus a device buffer of N floats
  auto allocate_y = taskflow.emplace([&]() {
    std::cout << "allocating host y and device y ...\n";
    hy.resize(N, 2.0f);
    check_cuda(cudaMalloc(&dy, N*sizeof(float)), "cudaMalloc(dy)");
  }).name("allocate_y");

  // saxpy: copy inputs to the device, run the kernel, copy both arrays back
  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
    std::cout << "running cudaflow ...\n";
    auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
    auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
    auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
    auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
    // Grid size is rounded up so every element is covered by a thread.
    // NOTE(review): N is unsigned while the kernel declares `int n`; the
    // value 1<<20 is representable in both types.
    auto kernel = cf.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy)
                    .name("saxpy");
    kernel.succeed(h2d_x, h2d_y)
          .precede(d2h_x, d2h_y);
  }).name("saxpy");

  // Both buffers must exist before the cudaFlow touches them.
  cudaflow.succeed(allocate_x, allocate_y);

  // Add a verification task: x must be unchanged (1.0f) and
  // y must equal 2.0f*1.0f + 2.0f = 4.0f.
  auto verifier = taskflow.emplace([&](){
    float max_error = 0.0f;
    for (size_t i = 0; i < N; i++) {
      // std::fabs guarantees a floating-point absolute value; an unqualified
      // abs may resolve to the integer overload and truncate the error to 0.
      max_error = std::max(max_error, std::fabs(hx[i]-1.0f));
      max_error = std::max(max_error, std::fabs(hy[i]-4.0f));
    }
    std::cout << "saxpy finished with max error: " << max_error << '\n';
  }).succeed(cudaflow).name("verify");

  // free memory
  auto deallocate_x = taskflow.emplace([&](){
    std::cout << "deallocating device x ...\n";
    check_cuda(cudaFree(dx), "cudaFree(dx)");
    dx = nullptr;  // avoid leaving a dangling device pointer behind
  }).name("deallocate_x");

  auto deallocate_y = taskflow.emplace([&](){
    std::cout << "deallocating device y ...\n";
    check_cuda(cudaFree(dy), "cudaFree(dy)");
    dy = nullptr;  // avoid leaving a dangling device pointer behind
  }).name("deallocate_y");

  // Device buffers are released only after verification has finished.
  verifier.precede(deallocate_x, deallocate_y);

  executor.run(taskflow).wait();

  std::cout << "dumping the taskflow ...\n";
  taskflow.dump(std::cout);

  return 0;
}
83+

0 commit comments

Comments
 (0)