|
1 | 1 | // This program implements the k-means clustering algorithm in four forms: |
2 | 2 | // - sequential cpu |
3 | 3 | // - parallel cpu |
4 | | -// - gpu |
| 4 | +// - gpu with conditional tasking |
| 5 | +// - gpu without conditional tasking |
5 | 6 |
|
6 | 7 | #include <taskflow/taskflow.hpp> |
7 | 8 |
|
@@ -220,7 +221,7 @@ __global__ void compute_new_means( |
220 | 221 | my[cluster] = sy[cluster] / count; |
221 | 222 | } |
222 | 223 |
|
223 | | -// run k-means on gpu |
| 224 | +// Runs k-means on gpu using conditional tasking |
224 | 225 | std::pair<std::vector<float>, std::vector<float>> gpu( |
225 | 226 | const int N, |
226 | 227 | const int K, |
@@ -335,6 +336,117 @@ std::pair<std::vector<float>, std::vector<float>> gpu( |
335 | 336 | return {h_mx, h_my}; |
336 | 337 | } |
337 | 338 |
|
| 339 | +// Runs k-means on gpu without using conditional tasking |
| 340 | +std::pair<std::vector<float>, std::vector<float>> gpu_predicate( |
| 341 | + const int N, |
| 342 | + const int K, |
| 343 | + const int M, |
| 344 | + const std::vector<float>& h_px, |
| 345 | + const std::vector<float>& h_py |
| 346 | +) { |
| 347 | + |
| 348 | + std::vector<float> h_mx, h_my; |
| 349 | + float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy, *d_c; |
| 350 | + |
| 351 | + for(int i=0; i<K; ++i) { |
| 352 | + h_mx.push_back(h_px[i]); |
| 353 | + h_my.push_back(h_py[i]); |
| 354 | + } |
| 355 | + |
| 356 | + // create a taskflow graph |
| 357 | + tf::Executor executor; |
| 358 | + tf::Taskflow taskflow("K-Means"); |
| 359 | + |
| 360 | + auto allocate_px = taskflow.emplace([&](){ |
| 361 | + TF_CHECK_CUDA(cudaMalloc(&d_px, N*sizeof(float)), "failed to allocate d_px"); |
| 362 | + }).name("allocate_px"); |
| 363 | + |
| 364 | + auto allocate_py = taskflow.emplace([&](){ |
| 365 | + TF_CHECK_CUDA(cudaMalloc(&d_py, N*sizeof(float)), "failed to allocate d_py"); |
| 366 | + }).name("allocate_py"); |
| 367 | + |
| 368 | + auto allocate_mx = taskflow.emplace([&](){ |
| 369 | + TF_CHECK_CUDA(cudaMalloc(&d_mx, K*sizeof(float)), "failed to allocate d_mx"); |
| 370 | + }).name("allocate_mx"); |
| 371 | + |
| 372 | + auto allocate_my = taskflow.emplace([&](){ |
| 373 | + TF_CHECK_CUDA(cudaMalloc(&d_my, K*sizeof(float)), "failed to allocate d_my"); |
| 374 | + }).name("allocate_my"); |
| 375 | + |
| 376 | + auto allocate_sx = taskflow.emplace([&](){ |
| 377 | + TF_CHECK_CUDA(cudaMalloc(&d_sx, K*sizeof(float)), "failed to allocate d_sx"); |
| 378 | + }).name("allocate_sx"); |
| 379 | + |
| 380 | + auto allocate_sy = taskflow.emplace([&](){ |
| 381 | + TF_CHECK_CUDA(cudaMalloc(&d_sy, K*sizeof(float)), "failed to allocate d_sy"); |
| 382 | + }).name("allocate_sy"); |
| 383 | + |
| 384 | + auto allocate_c = taskflow.emplace([&](){ |
| 385 | + TF_CHECK_CUDA(cudaMalloc(&d_c, K*sizeof(float)), "failed to allocate dc"); |
| 386 | + }).name("allocate_c"); |
| 387 | + |
| 388 | + auto h2d = taskflow.emplace([&](tf::cudaFlow& cf){ |
| 389 | + cf.copy(d_px, h_px.data(), N).name("h2d_px"); |
| 390 | + cf.copy(d_py, h_py.data(), N).name("h2d_py"); |
| 391 | + cf.copy(d_mx, h_mx.data(), K).name("h2d_mx"); |
| 392 | + cf.copy(d_my, h_my.data(), K).name("h2d_my"); |
| 393 | + }).name("h2d"); |
| 394 | + |
| 395 | + auto kmeans = taskflow.emplace([&](tf::cudaFlow& cf){ |
| 396 | + |
| 397 | + auto zero_c = cf.zero(d_c, K).name("zero_c"); |
| 398 | + auto zero_sx = cf.zero(d_sx, K).name("zero_sx"); |
| 399 | + auto zero_sy = cf.zero(d_sy, K).name("zero_sy"); |
| 400 | + |
| 401 | + auto cluster = cf.kernel( |
| 402 | + (N+1024-1) / 1024, 1024, 0, |
| 403 | + assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c |
| 404 | + ).name("cluster"); |
| 405 | + |
| 406 | + auto new_centroid = cf.kernel( |
| 407 | + 1, K, 0, |
| 408 | + compute_new_means, d_mx, d_my, d_sx, d_sy, d_c |
| 409 | + ).name("new_centroid"); |
| 410 | + |
| 411 | + cluster.precede(new_centroid) |
| 412 | + .succeed(zero_c, zero_sx, zero_sy); |
| 413 | + |
| 414 | + cf.repeat(M); |
| 415 | + }).name("update_means"); |
| 416 | + |
| 417 | + auto stop = taskflow.emplace([&](tf::cudaFlow& cf){ |
| 418 | + cf.copy(h_mx.data(), d_mx, K).name("d2h_mx"); |
| 419 | + cf.copy(h_my.data(), d_my, K).name("d2h_my"); |
| 420 | + }).name("d2h"); |
| 421 | + |
| 422 | + auto free = taskflow.emplace([&](){ |
| 423 | + TF_CHECK_CUDA(cudaFree(d_px), "failed to free d_px"); |
| 424 | + TF_CHECK_CUDA(cudaFree(d_py), "failed to free d_py"); |
| 425 | + TF_CHECK_CUDA(cudaFree(d_mx), "failed to free d_mx"); |
| 426 | + TF_CHECK_CUDA(cudaFree(d_my), "failed to free d_my"); |
| 427 | + TF_CHECK_CUDA(cudaFree(d_sx), "failed to free d_sx"); |
| 428 | + TF_CHECK_CUDA(cudaFree(d_sy), "failed to free d_sy"); |
| 429 | + TF_CHECK_CUDA(cudaFree(d_c), "failed to free d_c"); |
| 430 | + }).name("free"); |
| 431 | + |
| 432 | + // build up the dependency |
| 433 | + h2d.succeed(allocate_px, allocate_py, allocate_mx, allocate_my); |
| 434 | + |
| 435 | + kmeans.succeed(allocate_sx, allocate_sy, allocate_c, h2d) |
| 436 | + .precede(stop); |
| 437 | + |
| 438 | + stop.precede(free); |
| 439 | + |
| 440 | + //taskflow.dump(std::cout); |
| 441 | + |
| 442 | + // run the taskflow |
| 443 | + executor.run(taskflow).wait(); |
| 444 | + |
| 445 | + //std::cout << "dumping kmeans graph ...\n"; |
| 446 | + //taskflow.dump(std::cout); |
| 447 | + return {h_mx, h_my}; |
| 448 | +} |
| 449 | + |
338 | 450 | // Function: main |
339 | 451 | int main(int argc, const char* argv[]) { |
340 | 452 |
|
@@ -398,22 +510,35 @@ int main(int argc, const char* argv[]) { |
398 | 510 | << std::setw(10) << my[k] << '\n'; |
399 | 511 | } |
400 | 512 |
|
401 | | - // k-means on gpu |
402 | | - std::cout << "running k-means on gpu ... "; |
| 513 | + // k-means on gpu with conditional tasking |
| 514 | + std::cout << "running k-means on gpu (with conditional tasking) ... "; |
403 | 515 | auto gbeg = std::chrono::steady_clock::now(); |
404 | 516 | std::tie(mx, my) = gpu(N, K, M, h_px, h_py); |
405 | 517 | auto gend = std::chrono::steady_clock::now(); |
406 | 518 | std::cout << "completed with " |
407 | 519 | << std::chrono::duration_cast<std::chrono::milliseconds>(gend-gbeg).count() |
408 | 520 | << " ms\n"; |
409 | 521 |
|
410 | | - std::cout << "k centroids found by gpu\n"; |
| 522 | + std::cout << "k centroids found by gpu (with conditional tasking)\n"; |
411 | 523 | for(int k=0; k<K; ++k) { |
412 | 524 | std::cout << "centroid " << k << ": " << std::setw(10) << mx[k] << ' ' |
413 | 525 | << std::setw(10) << my[k] << '\n'; |
414 | 526 | } |
415 | 527 |
|
416 | | - |
| 528 | + // k-means on gpu without conditional tasking |
| 529 | + std::cout << "running k-means on gpu (without conditional tasking) ... "; |
| 530 | + auto rbeg = std::chrono::steady_clock::now(); |
| 531 | + std::tie(mx, my) = gpu_predicate(N, K, M, h_px, h_py); |
| 532 | + auto rend = std::chrono::steady_clock::now(); |
| 533 | + std::cout << "completed with " |
| 534 | + << std::chrono::duration_cast<std::chrono::milliseconds>(rend-rbeg).count() |
| 535 | + << " ms\n"; |
| 536 | + |
| 537 | + std::cout << "k centroids found by gpu (without conditional tasking)\n"; |
| 538 | + for(int k=0; k<K; ++k) { |
| 539 | + std::cout << "centroid " << k << ": " << std::setw(10) << mx[k] << ' ' |
| 540 | + << std::setw(10) << my[k] << '\n'; |
| 541 | + } |
417 | 542 |
|
418 | 543 | return 0; |
419 | 544 | } |
|
0 commit comments