atomics: Add warp-aggregated atomic increment

Faster atomic counter increment using warp-aggregated atomics. Useful for filtering. Adapted from: https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
rapidsai · Jul 8, 2022 · 4a76a73 · 4a76a73
1 parent 2b27bad
commit 4a76a73
Showing 1 changed file with 30 additions and 0 deletions.
diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh
@@ -28,6 +28,7 @@
  */
 
 #include <type_traits>
+#include <cooperative_groups.h>
 
 namespace raft {
 
@@ -636,3 +637,32 @@ __forceinline__ __device__ T atomicXor(T* address, T val)
 {
   return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{});
 }
+
+/**
+ * @brief Warp-aggregated atomic increment
+ *
+ * Increments a counter once on behalf of every thread in the calling coalesced
+ * group: the group leader issues a single atomicAdd of the group size and the
+ * previous value is broadcast. Only the leader's `ctr` is used, so all callers
+ * in the group are expected to pass the same address.
+ *
+ * This is a performance optimization that can reduce atomic memory traffic by
+ * up to a factor of 32 (one atomic per warp instead of one per thread).
+ * Adapted from:
+ * https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+ *
+ * @tparam          T An integral type
+ * @param[in,out] ctr Address of the counter; it is advanced by the group size
+ *
+ * @return The old value of the counter plus the calling thread's rank in the group.
+ */
+template <typename T                                                = unsigned int,
+          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__device__ T atomicIncWarp(T* ctr)
+{
+  namespace cg = cooperative_groups;
+  auto g       = cg::coalesced_threads();
+  T warp_res{};  // value-initialized: non-leader lanes also pass it to shfl below
+  if (g.thread_rank() == 0) { warp_res = atomicAdd(ctr, static_cast<T>(g.size())); }
+  return static_cast<T>(g.shfl(warp_res, 0) + g.thread_rank());
+}