
matrix::select_k: move selection and warp-sort primitives #1085

Merged: 42 commits from achirkin:enh-matrix-topk into branch-23.02 on Jan 23, 2023
Changes from 1 commit

Commits (42)
39c10a9
Make warp-level bitonic sort public
achirkin Dec 9, 2022
6cda736
Move spatial::*::select_topk to matrix::select_k
achirkin Dec 9, 2022
c5631bf
Fix includes style
achirkin Dec 9, 2022
fb88433
Use cmake-format
achirkin Dec 9, 2022
f64325b
Refactored warpsort module and made tests for all implementations in …
achirkin Dec 13, 2022
20d01d7
Resort to UVM when radix buffers are too big
achirkin Dec 14, 2022
4813bae
Adjust the dummy_block_sort_t to the changes in the warpsort impl
achirkin Dec 14, 2022
6cdb79a
Fix incorrect include
achirkin Dec 14, 2022
870fc86
Add benchmarks
achirkin Dec 14, 2022
2af45bf
Update CMakeLists.txt style
achirkin Dec 14, 2022
5b336ee
Update CMakeLists.txt style
achirkin Dec 14, 2022
b3e5d9c
Add mdspanified interface
achirkin Dec 15, 2022
164157b
Remove benchmarks for the legacy interface
achirkin Dec 15, 2022
69c81dd
Remove a TODO comment about a seemingly resolved bug
achirkin Dec 15, 2022
d64b12b
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Dec 15, 2022
9d4476a
Fix the changed include extension
achirkin Dec 15, 2022
3e40435
Fix includes in tests
achirkin Dec 16, 2022
e20578e
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Dec 16, 2022
b2c79f5
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Dec 20, 2022
98e2c2a
Address comments: bitonic_sort
achirkin Dec 20, 2022
af4c146
Replace stream argument with handle_t
achirkin Dec 20, 2022
471828e
rename files to select.* -> select_k.*
achirkin Dec 20, 2022
f6ff223
Use raft macros
achirkin Dec 20, 2022
066208d
Try to pass null and non-null arguments to select_k
achirkin Dec 20, 2022
aeaa1ef
Remove raw-pointer api from the public namespace
achirkin Dec 20, 2022
685b6bf
Updates public docs (add example usage)
achirkin Dec 21, 2022
5c42209
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 9, 2023
2cea50d
Add device_mem_resource
achirkin Jan 9, 2023
a31e61e
Add Doxygen docs
achirkin Jan 10, 2023
a8c5a70
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 10, 2023
8a5978b
Revert the memory_resource param changes in the detail namespace to a…
achirkin Jan 10, 2023
8e58cab
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 11, 2023
a01a75f
Remove device_mem_resource
achirkin Jan 11, 2023
c6256b7
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 16, 2023
c25e859
Merge branch 'branch-23.02' into enh-matrix-topk
cjnolet Jan 19, 2023
6e56106
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 20, 2023
c78d9b0
Reference a TODO issue
achirkin Jan 20, 2023
a55a6cb
Merge branch 'enh-matrix-topk' of github.com:achirkin/raft into enh-m…
achirkin Jan 20, 2023
307b113
Deprecation notice
achirkin Jan 20, 2023
c0ce160
Add [in] annotation to all arguments
achirkin Jan 20, 2023
e2cc7ad
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 23, 2023
dc3043c
Merge branch 'branch-23.02' into enh-matrix-topk
cjnolet Jan 23, 2023
Make warp-level bitonic sort public
achirkin committed Dec 9, 2022
commit 39c10a9aecd53038d211358f2e409aaf66f9b5ba
cpp/include/raft/util/bitonic_sort.cuh
@@ -18,9 +18,9 @@

#include <raft/util/cuda_utils.cuh>

namespace raft::spatial::knn::detail::topk {
namespace raft::util {

namespace helpers {
namespace {

template <typename T>
__device__ __forceinline__ void swap(T& x, T& y)
@@ -36,7 +36,7 @@ __device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x)
if (cond) { ptr = x; }
}

} // namespace helpers
} // namespace

/**
* Warp-wide bitonic merge and sort.
@@ -95,7 +95,7 @@ class bitonic {
*
* 1) Sort any bitonic sequence.
* 2) Merge two halves of the input data assuming they're already sorted, and their order is
* opposite (i.e. either ascending, descending or vice-versa).
* opposite (i.e. either ascending+descending or descending+ascending).
*
* The input pointers are unique per-thread.
* See the class description for the description of the data layout.
@@ -111,7 +111,7 @@
__device__ __forceinline__ void merge(KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads) const
{
return bitonic<Size>::merge_(ascending_, warp_width_, keys, payloads...);
return bitonic<Size>::merge_impl(ascending_, warp_width_, keys, payloads...);
}

/**
@@ -130,7 +130,7 @@
__device__ __forceinline__ void sort(KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads) const
{
return bitonic<Size>::sort_(ascending_, warp_width_, keys, payloads...);
return bitonic<Size>::sort_impl(ascending_, warp_width_, keys, payloads...);
}

/**
@@ -173,10 +173,10 @@ class bitonic {
friend class bitonic;

template <typename KeyT, typename... PayloadTs>
static __device__ __forceinline__ void merge_(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
static __device__ __forceinline__ void merge_impl(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
{
#pragma unroll
for (int size = Size; size > 1; size >>= 1) {
@@ -189,8 +189,8 @@ KeyT& key = keys[i];
KeyT& key = keys[i];
KeyT& other = keys[other_i];
if (ascending ? key > other : key < other) {
helpers::swap(key, other);
(helpers::swap(payloads[i], payloads[other_i]), ...);
swap(key, other);
(swap(payloads[i], payloads[other_i]), ...);
}
}
}
@@ -204,33 +204,32 @@
const KeyT other = shfl_xor(key, stride, warp_width);
const bool do_assign = (ascending != is_second) ? key > other : key < other;

helpers::conditional_assign(do_assign, key, other);
conditional_assign(do_assign, key, other);
// NB: don't put shfl_xor in a conditional; it must be called by all threads in a warp.
(helpers::conditional_assign(
do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
(conditional_assign(do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
...);
}
}
}

template <typename KeyT, typename... PayloadTs>
static __device__ __forceinline__ void sort_(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
static __device__ __forceinline__ void sort_impl(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
{
if constexpr (Size == 1) {
const int lane = laneId();
for (int width = 2; width < warp_width; width <<= 1) {
bitonic<1>::merge_(lane & width, width, keys, payloads...);
bitonic<1>::merge_impl(lane & width, width, keys, payloads...);
}
} else {
constexpr int kSize2 = Size / 2;
bitonic<kSize2>::sort_(false, warp_width, keys, payloads...);
bitonic<kSize2>::sort_(true, warp_width, keys + kSize2, (payloads + kSize2)...);
bitonic<kSize2>::sort_impl(false, warp_width, keys, payloads...);
bitonic<kSize2>::sort_impl(true, warp_width, keys + kSize2, (payloads + kSize2)...);
}
bitonic<Size>::merge_(ascending, warp_width, keys, payloads...);
bitonic<Size>::merge_impl(ascending, warp_width, keys, payloads...);
}
};

} // namespace raft::spatial::knn::detail::topk
} // namespace raft::util
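
For illustration, a minimal sketch of how the now-public raft::util::bitonic API can be used from a kernel. It assumes the bitonic<Capacity>(ascending, warp_width) constructor exercised by the new test added below; the kernel name sort_one_warp and the constant kCapacity are illustrative only, not part of this PR.

#include <raft/util/bitonic_sort.cuh>

constexpr int kCapacity = 4;  // number of keys kept by each thread

// Sort one chunk of 32 * kCapacity keys held by a single warp, in ascending order.
__global__ void sort_one_warp(float* data)
{
  const int lane = threadIdx.x % 32;
  float local[kCapacity];
  // Strided layout, as in the test kernel below: thread `lane` owns
  // elements lane, lane + 32, lane + 64, ... of the chunk.
  for (int i = 0; i < kCapacity; i++) {
    local[i] = data[i * 32 + lane];
  }
  // Cooperatively sort all 32 * kCapacity keys across the warp.
  raft::util::bitonic<kCapacity>(/*ascending=*/true, /*warp_width=*/32).sort(local);
  for (int i = 0; i < kCapacity; i++) {
    data[i * 32 + lane] = local[i];
  }
}

Launched as sort_one_warp<<<1, 32>>>(d_ptr) on a 128-element device buffer, this leaves the buffer fully sorted. merge() has the same calling convention but, as the documentation above states, expects the two halves of the data to already be sorted in opposite directions (e.g. ascending followed by descending).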
11 changes: 9 additions & 2 deletions cpp/test/CMakeLists.txt
@@ -288,7 +288,14 @@ if(BUILD_TESTS)
)

ConfigureTest(
NAME UTILS_TEST PATH test/common/seive.cu test/cudart_utils.cpp test/device_atomics.cu
test/integer_utils.cpp test/pow2_utils.cu
NAME
UTILS_TEST
PATH
test/common/seive.cu
test/cudart_utils.cpp
test/device_atomics.cu
test/integer_utils.cpp
test/pow2_utils.cu
test/util/bitonic_sort.cu
)
endif()
198 changes: 198 additions & 0 deletions cpp/test/util/bitonic_sort.cu
@@ -0,0 +1,198 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <raft/util/bitonic_sort.cuh>

#include <algorithm>
#include <gtest/gtest.h>
#include <numeric>
#include <raft/cudart_utils.h>
#include <raft/random/rng.hpp>

#include "../test_utils.h"

namespace raft::util {

constexpr int kMaxBlockSize = 512;
constexpr int kMaxCapacity = 128;

struct test_spec {
int n_inputs;
int warp_width;
int capacity;
bool ascending;

[[nodiscard]] auto len() const -> int { return n_inputs * warp_width * capacity; }
};

auto operator<<(std::ostream& os, const test_spec& ss) -> std::ostream&
{
os << "spec{n_inputs: " << ss.n_inputs << ", input_len: " << (ss.warp_width * ss.capacity) << " ("
<< ss.warp_width << " * " << ss.capacity << ")";
os << (ss.ascending ? "; asc}" : "; dsc}");
return os;
}

template <int Capacity, typename T>
__global__ void bitonic_kernel(T* arr, bool ascending, int warp_width, int n_inputs)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
const int subwarp_id = tid / warp_width;
const int subwarp_lane = tid % warp_width;
T local_arr[Capacity]; // NOLINT
// Split the data into chunks of size `warp_width * Capacity`, each thread pointing
// to the beginning of its stride within the chunk.
T* per_thread_arr = arr + subwarp_id * warp_width * Capacity + subwarp_lane;

if (subwarp_id < n_inputs) {
#pragma unroll
for (int i = 0; i < Capacity; i++) {
local_arr[i] = per_thread_arr[i * warp_width];
}
}

bitonic<Capacity>(ascending, warp_width).sort(local_arr);

if (subwarp_id < n_inputs) {
#pragma unroll
for (int i = 0; i < Capacity; i++) {
per_thread_arr[i * warp_width] = local_arr[i];
}
}
}

template <int Capacity>
struct bitonic_launch {
template <typename T>
static void run(const test_spec& spec, T* arr, rmm::cuda_stream_view stream)
{
ASSERT(spec.capacity <= Capacity, "Invalid input: the requested capacity is too high.");
ASSERT(spec.warp_width <= WarpSize,
"Invalid input: the requested warp_width must be not larger than the WarpSize.");
if constexpr (Capacity > 1) {
if (spec.capacity < Capacity) {
return bitonic_launch<std::max(1, Capacity / 2)>::run(spec, arr, stream);
}
}
int max_block_size, min_grid_size;
RAFT_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
&min_grid_size, &max_block_size, bitonic_kernel<Capacity, T>, 0, kMaxBlockSize));
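// Pick a whole number of warps per block: enough threads to cover all inputs,
// but no more than the occupancy-suggested block size; each block then handles
// block_dim / spec.warp_width independent inputs.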
const int n_warps =
ceildiv(std::min(spec.n_inputs * spec.warp_width, max_block_size), WarpSize);
const int block_dim = n_warps * WarpSize;
const int n_subwarps = block_dim / spec.warp_width;
const int grid_dim = ceildiv(spec.n_inputs, n_subwarps);
bitonic_kernel<Capacity, T>
<<<grid_dim, block_dim, 0, stream>>>(arr, spec.ascending, spec.warp_width, spec.n_inputs);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}
};

template <typename T>
class BitonicTest : public testing::TestWithParam<test_spec> { // NOLINT
protected:
const test_spec spec; // NOLINT
std::vector<T> in; // NOLINT
std::vector<T> out; // NOLINT
std::vector<T> ref; // NOLINT

void segmented_sort(std::vector<T>& vec, int k, bool ascending) // NOLINT
{
std::vector<int> p(vec.size());
std::iota(p.begin(), p.end(), 0);
std::sort(p.begin(), p.end(), [&vec, k, ascending](int i, int j) {
const int ik = i / k;
const int jk = j / k;
if (ik == jk) { return ascending ? vec[i] < vec[j] : vec[i] > vec[j]; }
return ik < jk;
});
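// Apply the permutation p in place (so that vec[i] becomes the old vec[p[i]]):
// walking from the back, chase p until the source index falls at or below i
// (higher positions are already final), then swap the element into place.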
for (auto i = int(vec.size()) - 1; i > 0; i--) {
auto j = p[i];
while (j > i)
j = p[j];
std::swap(vec[j], vec[i]);
}
}

void fill_random(rmm::device_uvector<T>& arr, rmm::cuda_stream_view stream)
{
raft::random::Rng rng(42);
if constexpr (std::is_floating_point_v<T>) {
return rng.normal(arr.data(), arr.size(), T(10), T(100), stream);
}
if constexpr (std::is_integral_v<T>) {
return rng.normalInt(arr.data(), arr.size(), T(10), T(100), stream);
}
}

public:
explicit BitonicTest()
: spec(testing::TestWithParam<test_spec>::GetParam()),
in(spec.len()),
out(spec.len()),
ref(spec.len())
{
auto stream = rmm::cuda_stream_default;

// generate input
rmm::device_uvector<T> arr_d(spec.len(), stream);
fill_random(arr_d, stream);
update_host(in.data(), arr_d.data(), arr_d.size(), stream);

// calculate the results
bitonic_launch<kMaxCapacity>::run(spec, arr_d.data(), stream);
update_host(out.data(), arr_d.data(), arr_d.size(), stream);

// make sure the results are available on host
stream.synchronize();

// calculate the reference
std::copy(in.begin(), in.end(), ref.begin());
segmented_sort(ref, spec.warp_width * spec.capacity, spec.ascending);
}

void run() { ASSERT_TRUE(hostVecMatch(ref, out, Compare<T>())); }
};

auto inputs = ::testing::Values(test_spec{1, 1, 1, true},
test_spec{1, 2, 1, true},
test_spec{1, 4, 1, true},
test_spec{1, 8, 1, true},
test_spec{1, 16, 1, false},
test_spec{1, 32, 1, false},
test_spec{1, 32, 2, false},
test_spec{1, 32, 4, true},
test_spec{1, 32, 8, true},
test_spec{5, 32, 2, true},
test_spec{7, 16, 4, true},
test_spec{7, 8, 2, true},
test_spec{70, 4, 32, true},
test_spec{70, 1, 64, true},
test_spec{70, 2, 128, false});

using Floats = BitonicTest<float>; // NOLINT
TEST_P(Floats, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Floats, inputs); // NOLINT

using Ints = BitonicTest<int>; // NOLINT
TEST_P(Ints, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Ints, inputs); // NOLINT

using Doubles = BitonicTest<double>; // NOLINT
TEST_P(Doubles, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Doubles, inputs); // NOLINT

} // namespace raft::util