Skip to content

Commit 852923e

Browse files
committed
added memset node
1 parent 7689ea9 commit 852923e

3 files changed

Lines changed: 114 additions & 16 deletions

File tree

taskflow/cuda/cuda_flow_builder.hpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ namespace tf {
77
/**
88
@class cudaFlow
99
10-
@brief Building methods of a cuda task dependency graph.
10+
@brief Building methods for a cuda task dependency graph.
1111
*/
1212
class cudaFlow {
1313

@@ -78,6 +78,18 @@ class cudaFlow {
7878
*/
7979
template <typename F, typename... ArgsT>
8080
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
81+
82+
/**
83+
@brief creates a memset node
84+
85+
@param dst pointer to the destination device memory area
86+
@param ch value to set for each byte of specified memory
87+
@param count size in bytes to set
88+
89+
A memset tasks fills the first @c count bytes of device memory area
90+
pointed by @c dst with the byte value @ch.
91+
*/
92+
cudaTask memset(void* dst, int ch, size_t count);
8193

8294
/**
8395
@brief creates an 1D copy task
@@ -240,6 +252,32 @@ cudaTask cudaFlow::kernel_on(
240252
return cudaTask(node);
241253
}
242254

255+
// Function: memset
256+
inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
257+
258+
auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
259+
[=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
260+
cudaMemsetParams p;
261+
p.dst = dst;
262+
p.value = ch;
263+
p.pitch = 0;
264+
//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
265+
//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
266+
p.elementSize = 1; // either 1, 2, or 4
267+
p.width = count;
268+
269+
p.height = 1;
270+
TF_CHECK_CUDA(
271+
cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
272+
"failed to create a cudaMemset node"
273+
);
274+
}
275+
);
276+
277+
return cudaTask(node);
278+
}
279+
280+
243281
// Function: copy
244282
template <
245283
typename T,

taskflow/cuda/cuda_graph.hpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ class cudaNode {
2727
//struct Host {
2828
// cudaHostNodeParams param;
2929
//};
30-
30+
31+
// Noop handle
3132
struct Noop {
3233

3334
template <typename C>
@@ -36,14 +37,21 @@ class cudaNode {
3637
std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
3738
};
3839

40+
// Memset handle
41+
struct Memset {
42+
43+
template <typename C>
44+
Memset(C&&);
45+
46+
std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
47+
};
48+
3949
// Copy handle
4050
struct Copy {
4151

4252
template <typename C>
4353
Copy(C&&);
4454

45-
//cudaMemcpy3DParms param;
46-
4755
std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
4856
};
4957

@@ -53,16 +61,15 @@ class cudaNode {
5361
template <typename C>
5462
Kernel(C&&);
5563

56-
//cudaKernelNodeParams param;
57-
5864
std::function<void(cudaGraph_t&, cudaGraphNode_t&)> work;
5965
};
6066

61-
using handle_t = nstd::variant<nstd::monostate, Noop, Copy, Kernel>;
67+
using handle_t = nstd::variant<nstd::monostate, Noop, Memset, Copy, Kernel>;
6268

6369
// variant index
64-
constexpr static auto NOOP = get_index_v<Noop, handle_t>;
65-
constexpr static auto COPY = get_index_v<Copy, handle_t>;
70+
constexpr static auto NOOP = get_index_v<Noop, handle_t>;
71+
constexpr static auto MEMSET = get_index_v<Memset, handle_t>;
72+
constexpr static auto COPY = get_index_v<Copy, handle_t>;
6673
constexpr static auto KERNEL = get_index_v<Kernel, handle_t>;
6774

6875
public:
@@ -128,6 +135,11 @@ template <typename C>
128135
cudaNode::Noop::Noop(C&& c) : work {std::forward<C>(c)} {
129136
}
130137

138+
// Memset handle constructor
139+
template <typename C>
140+
cudaNode::Memset::Memset(C&& c) : work {std::forward<C>(c)} {
141+
}
142+
131143
// Copy handle constructor
132144
template <typename C>
133145
cudaNode::Copy::Copy(C&& c) : work {std::forward<C>(c)} {
@@ -146,10 +158,6 @@ cudaNode::cudaNode(ArgsT&&... args) : _handle {std::forward<ArgsT>(args)...} {
146158
// Procedure: _precede
147159
inline void cudaNode::_precede(cudaNode* v) {
148160
_successors.push_back(v);
149-
//TF_CHECK_CUDA(
150-
// ::cudaGraphAddDependencies(_graph._handle, &_node, &(v->_node), 1),
151-
// "failed to add a preceding link"
152-
//);
153161
}
154162

155163
// ----------------------------------------------------------------------------
@@ -219,6 +227,12 @@ inline void cudaGraph::_make_native_graph(int d) {
219227
);
220228
break;
221229

230+
case cudaNode::MEMSET:
231+
nstd::get<cudaNode::Memset>(node->_handle).work(
232+
_native_handle, node->_native_handle
233+
);
234+
break;
235+
222236
case cudaNode::COPY:
223237
nstd::get<cudaNode::Copy>(node->_handle).work(
224238
_native_handle, node->_native_handle
@@ -230,9 +244,6 @@ inline void cudaGraph::_make_native_graph(int d) {
230244
_native_handle, node->_native_handle
231245
);
232246
break;
233-
234-
default:
235-
break;
236247
}
237248
}
238249

unittests/cuda/cuda_basics.cu

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,55 @@ TEST_CASE("BSet.i32" * doctest::timeout(300)) {
247247
bset<int32_t>();
248248
}
249249

250+
// --------------------------------------------------------
251+
// Testcase: Memset
252+
// --------------------------------------------------------
253+
TEST_CASE("Memset") {
254+
255+
tf::Taskflow taskflow;
256+
tf::Executor executor;
257+
258+
const int N = 100;
259+
260+
int* cpu = new int [N];
261+
int* gpu = nullptr;
262+
263+
REQUIRE(cudaMalloc(&gpu, N*sizeof(int)) == cudaSuccess);
264+
265+
for(int r=1; r<=100; ++r) {
266+
267+
int start = ::rand() % N;
268+
269+
for(int i=0; i<N; ++i) {
270+
cpu[i] = 999;
271+
}
272+
273+
taskflow.emplace([&](tf::cudaFlow& cf){
274+
dim3 g = {(unsigned)(N+255)/256, 1, 1};
275+
dim3 b = {256, 1, 1};
276+
auto kset = cf.kernel(g, b, 0, k_set<int>, gpu, N, 123);
277+
auto zero = cf.memset(gpu+start, 0x3f, (N-start)*sizeof(int));
278+
auto copy = cf.copy(cpu, gpu, N);
279+
kset.precede(zero);
280+
zero.precede(copy);
281+
});
282+
283+
executor.run(taskflow).wait();
284+
285+
for(int i=0; i<start; ++i) {
286+
REQUIRE(cpu[i] == 123);
287+
}
288+
for(int i=start; i<N; ++i) {
289+
REQUIRE(cpu[i] == 0x3f3f3f3f);
290+
}
291+
}
292+
293+
294+
delete [] cpu;
295+
REQUIRE(cudaFree(gpu) == cudaSuccess);
296+
}
297+
298+
250299
// --------------------------------------------------------
251300
// Testcase: Barrier
252301
// --------------------------------------------------------

0 commit comments

Comments
 (0)