
matrix::select_k: move selection and warp-sort primitives #1085

Merged: 42 commits from achirkin:enh-matrix-topk into branch-23.02 on Jan 23, 2023
Changes from 1 commit

Commits (42)
39c10a9
Make warp-level bitonic sort public
achirkin Dec 9, 2022
6cda736
Move spatial::*::select_topk to matrix::select_k
achirkin Dec 9, 2022
c5631bf
Fix includes style
achirkin Dec 9, 2022
fb88433
Use cmake-format
achirkin Dec 9, 2022
f64325b
Refactored warpsort module and made tests for all implementations in …
achirkin Dec 13, 2022
20d01d7
Resort to UVM when radix buffers are too big
achirkin Dec 14, 2022
4813bae
Adjust the dummy_block_sort_t to the changes in the warpsort impl
achirkin Dec 14, 2022
6cdb79a
Fix incorrect include
achirkin Dec 14, 2022
870fc86
Add benchmarks
achirkin Dec 14, 2022
2af45bf
Update CMakeLists.txt style
achirkin Dec 14, 2022
5b336ee
Update CMakeLists.txt style
achirkin Dec 14, 2022
b3e5d9c
Add mdspanified interface
achirkin Dec 15, 2022
164157b
Remove benchmarks for the legacy interface
achirkin Dec 15, 2022
69c81dd
Remove a TODO comment about a seemingly resolved bug
achirkin Dec 15, 2022
d64b12b
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Dec 15, 2022
9d4476a
Fix the changed include extension
achirkin Dec 15, 2022
3e40435
Fix includes in tests
achirkin Dec 16, 2022
e20578e
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Dec 16, 2022
b2c79f5
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Dec 20, 2022
98e2c2a
Address comments: bitonic_sort
achirkin Dec 20, 2022
af4c146
Replace stream argument with handle_t
achirkin Dec 20, 2022
471828e
rename files to select.* -> select_k.*
achirkin Dec 20, 2022
f6ff223
Use raft macros
achirkin Dec 20, 2022
066208d
Try to pass null and non-null arguments to select_k
achirkin Dec 20, 2022
aeaa1ef
Remove raw-pointer api from the public namespace
achirkin Dec 20, 2022
685b6bf
Updates public docs (add example usage)
achirkin Dec 21, 2022
5c42209
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 9, 2023
2cea50d
Add device_mem_resource
achirkin Jan 9, 2023
a31e61e
Add Doxygen docs
achirkin Jan 10, 2023
a8c5a70
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 10, 2023
8a5978b
Revert the memory_resource param changes in the detail namespace to a…
achirkin Jan 10, 2023
8e58cab
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-matrix-…
achirkin Jan 11, 2023
a01a75f
Remove device_mem_resource
achirkin Jan 11, 2023
c6256b7
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 16, 2023
c25e859
Merge branch 'branch-23.02' into enh-matrix-topk
cjnolet Jan 19, 2023
6e56106
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 20, 2023
c78d9b0
Reference a TODO issue
achirkin Jan 20, 2023
a55a6cb
Merge branch 'enh-matrix-topk' of github.com:achirkin/raft into enh-m…
achirkin Jan 20, 2023
307b113
Deprecation notice
achirkin Jan 20, 2023
c0ce160
Add [in] annotation to all arguments
achirkin Jan 20, 2023
e2cc7ad
Merge branch 'branch-23.02' into enh-matrix-topk
achirkin Jan 23, 2023
dc3043c
Merge branch 'branch-23.02' into enh-matrix-topk
cjnolet Jan 23, 2023
Make warp-level bitonic sort public
achirkin committed Dec 9, 2022
commit 39c10a9aecd53038d211358f2e409aaf66f9b5ba
cpp/include/raft/util/bitonic_sort.cuh
@@ -18,9 +18,9 @@

#include <raft/util/cuda_utils.cuh>

namespace raft::spatial::knn::detail::topk {
namespace raft::util {

namespace helpers {
namespace {

template <typename T>
__device__ __forceinline__ void swap(T& x, T& y)
@@ -36,7 +36,7 @@ __device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x)
if (cond) { ptr = x; }
}

} // namespace helpers
} // namespace

/**
* Warp-wide bitonic merge and sort.
@@ -95,7 +95,7 @@ class bitonic {
*
* 1) Sort any bitonic sequence.
* 2) Merge two halves of the input data assuming they're already sorted, and their order is
* opposite (i.e. either ascending, descending or vice-versa).
* opposite (i.e. either ascending+descending or descending+ascending).
*
* The input pointers are unique per-thread.
* See the class description for the description of the data layout.
@@ -111,7 +111,7 @@
__device__ __forceinline__ void merge(KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads) const
{
return bitonic<Size>::merge_(ascending_, warp_width_, keys, payloads...);
return bitonic<Size>::merge_impl(ascending_, warp_width_, keys, payloads...);
}

/**
@@ -130,7 +130,7 @@
__device__ __forceinline__ void sort(KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads) const
{
return bitonic<Size>::sort_(ascending_, warp_width_, keys, payloads...);
return bitonic<Size>::sort_impl(ascending_, warp_width_, keys, payloads...);
}

/**
@@ -173,10 +173,10 @@ class bitonic {
friend class bitonic;

template <typename KeyT, typename... PayloadTs>
static __device__ __forceinline__ void merge_(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
static __device__ __forceinline__ void merge_impl(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
{
#pragma unroll
for (int size = Size; size > 1; size >>= 1) {
@@ -189,8 +189,8 @@ KeyT& key = keys[i];
KeyT& key = keys[i];
KeyT& other = keys[other_i];
if (ascending ? key > other : key < other) {
helpers::swap(key, other);
(helpers::swap(payloads[i], payloads[other_i]), ...);
swap(key, other);
(swap(payloads[i], payloads[other_i]), ...);
}
}
}
@@ -204,33 +204,32 @@
const KeyT other = shfl_xor(key, stride, warp_width);
const bool do_assign = (ascending != is_second) ? key > other : key < other;

helpers::conditional_assign(do_assign, key, other);
conditional_assign(do_assign, key, other);
// NB: don't put shfl_xor in a conditional; it must be called by all threads in a warp.
(helpers::conditional_assign(
do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
(conditional_assign(do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
...);
}
}
}

template <typename KeyT, typename... PayloadTs>
static __device__ __forceinline__ void sort_(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
static __device__ __forceinline__ void sort_impl(bool ascending,
int warp_width,
KeyT* __restrict__ keys,
PayloadTs* __restrict__... payloads)
{
if constexpr (Size == 1) {
const int lane = laneId();
for (int width = 2; width < warp_width; width <<= 1) {
bitonic<1>::merge_(lane & width, width, keys, payloads...);
bitonic<1>::merge_impl(lane & width, width, keys, payloads...);
}
} else {
constexpr int kSize2 = Size / 2;
bitonic<kSize2>::sort_(false, warp_width, keys, payloads...);
bitonic<kSize2>::sort_(true, warp_width, keys + kSize2, (payloads + kSize2)...);
bitonic<kSize2>::sort_impl(false, warp_width, keys, payloads...);
bitonic<kSize2>::sort_impl(true, warp_width, keys + kSize2, (payloads + kSize2)...);
}
bitonic<Size>::merge_(ascending, warp_width, keys, payloads...);
bitonic<Size>::merge_impl(ascending, warp_width, keys, payloads...);
}
};

} // namespace raft::spatial::knn::detail::topk
} // namespace raft::util
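
For illustration, a minimal sketch of how the now-public raft::util::bitonic API can be used from a kernel. It assumes the bitonic<Capacity>(ascending, warp_width) constructor exercised by the new test added below; the kernel name sort_one_warp and the constant kCapacity are illustrative only, not part of this PR.

#include <raft/util/bitonic_sort.cuh>

constexpr int kCapacity = 4;  // number of keys kept by each thread

// Sort one chunk of 32 * kCapacity keys held by a single warp, in ascending order.
__global__ void sort_one_warp(float* data)
{
  const int lane = threadIdx.x % 32;
  float local[kCapacity];
  // Strided layout, as in the test kernel below: thread `lane` owns
  // elements lane, lane + 32, lane + 64, ... of the chunk.
  for (int i = 0; i < kCapacity; i++) {
    local[i] = data[i * 32 + lane];
  }
  // Cooperatively sort all 32 * kCapacity keys across the warp.
  raft::util::bitonic<kCapacity>(/*ascending=*/true, /*warp_width=*/32).sort(local);
  for (int i = 0; i < kCapacity; i++) {
    data[i * 32 + lane] = local[i];
  }
}

Launched as sort_one_warp<<<1, 32>>>(d_ptr) on a 128-element device buffer, this leaves the buffer fully sorted. merge() has the same calling convention but, as the documentation above states, expects the two halves of the data to already be sorted in opposite directions (e.g. ascending followed by descending).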
11 changes: 9 additions & 2 deletions cpp/test/CMakeLists.txt
@@ -288,7 +288,14 @@ if(BUILD_TESTS)
)

ConfigureTest(
NAME UTILS_TEST PATH test/common/seive.cu test/cudart_utils.cpp test/device_atomics.cu
test/integer_utils.cpp test/pow2_utils.cu
NAME
UTILS_TEST
PATH
test/common/seive.cu
test/cudart_utils.cpp
test/device_atomics.cu
test/integer_utils.cpp
test/pow2_utils.cu
test/util/bitonic_sort.cu
)
endif()
198 changes: 198 additions & 0 deletions cpp/test/util/bitonic_sort.cu
@@ -0,0 +1,198 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <raft/util/bitonic_sort.cuh>

#include <algorithm>
#include <gtest/gtest.h>
#include <numeric>
#include <raft/cudart_utils.h>
#include <raft/random/rng.hpp>

#include "../test_utils.h"

namespace raft::util {

constexpr int kMaxBlockSize = 512;
constexpr int kMaxCapacity = 128;

struct test_spec {
int n_inputs;
int warp_width;
int capacity;
bool ascending;

[[nodiscard]] auto len() const -> int { return n_inputs * warp_width * capacity; }
};

auto operator<<(std::ostream& os, const test_spec& ss) -> std::ostream&
{
os << "spec{n_inputs: " << ss.n_inputs << ", input_len: " << (ss.warp_width * ss.capacity) << " ("
<< ss.warp_width << " * " << ss.capacity << ")";
os << (ss.ascending ? "; asc}" : "; dsc}");
return os;
}

template <int Capacity, typename T>
__global__ void bitonic_kernel(T* arr, bool ascending, int warp_width, int n_inputs)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
const int subwarp_id = tid / warp_width;
const int subwarp_lane = tid % warp_width;
T local_arr[Capacity]; // NOLINT
// Split the data into chunks of size `warp_width * Capacity`, each thread pointing
// to the beginning of its stride within the chunk.
T* per_thread_arr = arr + subwarp_id * warp_width * Capacity + subwarp_lane;

if (subwarp_id < n_inputs) {
#pragma unroll
for (int i = 0; i < Capacity; i++) {
local_arr[i] = per_thread_arr[i * warp_width];
}
}

bitonic<Capacity>(ascending, warp_width).sort(local_arr);

if (subwarp_id < n_inputs) {
#pragma unroll
for (int i = 0; i < Capacity; i++) {
per_thread_arr[i * warp_width] = local_arr[i];
}
}
}

template <int Capacity>
struct bitonic_launch {
template <typename T>
static void run(const test_spec& spec, T* arr, rmm::cuda_stream_view stream)
{
ASSERT(spec.capacity <= Capacity, "Invalid input: the requested capacity is too high.");
ASSERT(spec.warp_width <= WarpSize,
"Invalid input: the requested warp_width must be not larger than the WarpSize.");
if constexpr (Capacity > 1) {
if (spec.capacity < Capacity) {
return bitonic_launch<std::max(1, Capacity / 2)>::run(spec, arr, stream);
}
}
int max_block_size, min_grid_size;
RAFT_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
&min_grid_size, &max_block_size, bitonic_kernel<Capacity, T>, 0, kMaxBlockSize));
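// Pick a whole number of warps per block: enough threads to cover all inputs,
// but no more than the occupancy-suggested block size; each block then handles
// block_dim / spec.warp_width independent inputs.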
const int n_warps =
ceildiv(std::min(spec.n_inputs * spec.warp_width, max_block_size), WarpSize);
const int block_dim = n_warps * WarpSize;
const int n_subwarps = block_dim / spec.warp_width;
const int grid_dim = ceildiv(spec.n_inputs, n_subwarps);
bitonic_kernel<Capacity, T>
<<<grid_dim, block_dim, 0, stream>>>(arr, spec.ascending, spec.warp_width, spec.n_inputs);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}
};

template <typename T>
class BitonicTest : public testing::TestWithParam<test_spec> { // NOLINT
protected:
const test_spec spec; // NOLINT
std::vector<T> in; // NOLINT
std::vector<T> out; // NOLINT
std::vector<T> ref; // NOLINT

void segmented_sort(std::vector<T>& vec, int k, bool ascending) // NOLINT
{
std::vector<int> p(vec.size());
std::iota(p.begin(), p.end(), 0);
std::sort(p.begin(), p.end(), [&vec, k, ascending](int i, int j) {
const int ik = i / k;
const int jk = j / k;
if (ik == jk) { return ascending ? vec[i] < vec[j] : vec[i] > vec[j]; }
return ik < jk;
});
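// Apply the permutation p in place (so that vec[i] becomes the old vec[p[i]]):
// walking from the back, chase p until the source index falls at or below i
// (higher positions are already final), then swap the element into place.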
for (auto i = int(vec.size()) - 1; i > 0; i--) {
auto j = p[i];
while (j > i)
j = p[j];
std::swap(vec[j], vec[i]);
}
}

void fill_random(rmm::device_uvector<T>& arr, rmm::cuda_stream_view stream)
{
raft::random::Rng rng(42);
if constexpr (std::is_floating_point_v<T>) {
return rng.normal(arr.data(), arr.size(), T(10), T(100), stream);
}
if constexpr (std::is_integral_v<T>) {
return rng.normalInt(arr.data(), arr.size(), T(10), T(100), stream);
}
}

public:
explicit BitonicTest()
: spec(testing::TestWithParam<test_spec>::GetParam()),
in(spec.len()),
out(spec.len()),
ref(spec.len())
{
auto stream = rmm::cuda_stream_default;

// generate input
rmm::device_uvector<T> arr_d(spec.len(), stream);
fill_random(arr_d, stream);
update_host(in.data(), arr_d.data(), arr_d.size(), stream);

// calculate the results
bitonic_launch<kMaxCapacity>::run(spec, arr_d.data(), stream);
update_host(out.data(), arr_d.data(), arr_d.size(), stream);

// make sure the results are available on host
stream.synchronize();

// calculate the reference
std::copy(in.begin(), in.end(), ref.begin());
segmented_sort(ref, spec.warp_width * spec.capacity, spec.ascending);
}

void run() { ASSERT_TRUE(hostVecMatch(ref, out, Compare<T>())); }
};

auto inputs = ::testing::Values(test_spec{1, 1, 1, true},
test_spec{1, 2, 1, true},
test_spec{1, 4, 1, true},
test_spec{1, 8, 1, true},
test_spec{1, 16, 1, false},
test_spec{1, 32, 1, false},
test_spec{1, 32, 2, false},
test_spec{1, 32, 4, true},
test_spec{1, 32, 8, true},
test_spec{5, 32, 2, true},
test_spec{7, 16, 4, true},
test_spec{7, 8, 2, true},
test_spec{70, 4, 32, true},
test_spec{70, 1, 64, true},
test_spec{70, 2, 128, false});

using Floats = BitonicTest<float>; // NOLINT
TEST_P(Floats, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Floats, inputs); // NOLINT

using Ints = BitonicTest<int>; // NOLINT
TEST_P(Ints, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Ints, inputs); // NOLINT

using Doubles = BitonicTest<double>; // NOLINT
TEST_P(Doubles, Run) { run(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BitonicTest, Doubles, inputs); // NOLINT

} // namespace raft::util