atomics: Add warp-aggregated atomic increment

Faster atomic counter increment using warp-aggregated atomics. Useful for filtering. Adapted from: https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
rapidsai · rapids-bot · Jul 14, 2022 · Jul 8, 2022 · Jul 13, 2022 · 04ea9a72f8406245d5b6ed0b8a1934edc6a87865
commit 04ea9a72f8406245d5b6ed0b8a1934edc6a87865
@@ -27,6 +27,7 @@
  * binary operator.
  */
 
+#include <cooperative_groups.h>
 #include <type_traits>
 
 namespace raft {
@@ -636,3 +637,32 @@ __forceinline__ __device__ T atomicXor(T* address, T val)
 {
   return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{});
 }
+
+/**
+ * @brief: Warp aggregated atomic increment
+ *
+ * increments an atomic counter using all active threads in a warp. The return
+ * value is the original value of the counter plus the rank of the calling
+ * thread.
+ *
+ * The use of atomicIncWarp is a performance optimization. It can reduce the
+ * amount of atomic memory traffic by a factor of 32.
+ *
+ * Adapted from:
+ * https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+ *
+ * @tparam          T An integral type
+ * @param[in,out] ctr The address of old value
+ *
+ * @return The old value of the counter plus the rank of the calling thread.
+ */
+template <typename T                                                = unsigned int,
+          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__device__ T atomicIncWarp(T* ctr)
+{
+  namespace cg = cooperative_groups;
+  auto g       = cg::coalesced_threads();
+  T warp_res;
+  if (g.thread_rank() == 0) { warp_res = atomicAdd(ctr, static_cast<T>(g.size())); }
+  return g.shfl(warp_res, 0) + g.thread_rank();
+}
@@ -20,6 +20,7 @@ add_executable(test_raft
     test/common/seive.cu
     test/cudart_utils.cpp
     test/cluster_solvers.cu
+    test/device_atomics.cu
     test/distance/dist_adj.cu
     test/distance/dist_canberra.cu
     test/distance/dist_chebyshev.cu

@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <raft/cudart_utils.h>
+#include <raft/device_atomics.cuh>
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+
+__global__ void test_atomic_inc_warp_kernel(int* counter, int* out_array)
+{
+  int global_tid                    = blockDim.x * blockIdx.x + threadIdx.x;
+  out_array[atomicIncWarp(counter)] = global_tid;
+}
+
+TEST(Raft, AtomicIncWarp)
+{
+  const int num_blocks        = 1024;
+  const int threads_per_block = 1024;
+  const int num_elts          = num_blocks * threads_per_block;
+
+  rmm::cuda_stream_pool pool{1};
+  auto s = pool.get_stream();
+
+  rmm::device_scalar<int> counter{0, s};
+  rmm::device_uvector<int> out_device{num_elts, s};
+  std::array<int, num_elts> out_host{0};
+
+  // Write all 1M thread indices to a unique location in `out_device`
+  test_atomic_inc_warp_kernel<<<num_blocks, threads_per_block, 0, s>>>(counter.data(),
+                                                                       out_device.data());
+
+  // Copy data to host
+  RAFT_CUDA_TRY(cudaMemcpy(out_host.data(),
+                           (const void*)out_device.data(),
+                           num_elts * sizeof(int),
+                           cudaMemcpyDeviceToHost));
+
+  // Check that count is correct and that each thread index is contained in the
+  // array exactly once.
+  ASSERT_EQ(num_elts, counter.value(s));
+  std::sort(out_host.begin(), out_host.end());
+  for (int i = 0; i < num_elts; ++i) {
+    ASSERT_EQ(i, out_host[i]);
+  }
+}
+
+}  // namespace raft