Skip to content

Commit

Permalink
Add fusedL2NN benchmark (#936)
Browse files Browse the repository at this point in the history
Adds a benchmark for `fusedL2NN`. The benchmark used the wrapper `fusedL2NNMinReduce` compiled in the distance library, for faster compilation times.

Authors:
  - Louis Sugy (https://github.com/Nyrio)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #936
  • Loading branch information
Nyrio authored Oct 27, 2022
1 parent 9ab17db commit 93a20b8
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 128 deletions.
2 changes: 1 addition & 1 deletion cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ if(BUILD_BENCH)
bench/distance/distance_exp_l2.cu
bench/distance/distance_l1.cu
bench/distance/distance_unexp_l2.cu
bench/distance/fused_l2_nn.cu
bench/distance/kernels.cu
bench/main.cpp
OPTIONAL DIST
Expand Down Expand Up @@ -116,7 +117,6 @@ if(BUILD_BENCH)

ConfigureBench(NAME NEIGHBORS_BENCH
PATH
bench/neighbors/fused_l2_nn.cu
bench/neighbors/knn/brute_force_float_int64_t.cu
bench/neighbors/knn/brute_force_float_uint32_t.cu
bench/neighbors/knn/ivf_flat_float_int64_t.cu
Expand Down
153 changes: 153 additions & 0 deletions cpp/bench/distance/fused_l2_nn.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/distance/fused_l2_nn.cuh>
#include <raft/util/cudart_utils.hpp>
#if defined RAFT_DISTANCE_COMPILED
#include <raft/distance/specializations.cuh>
#endif
#include <rmm/device_uvector.hpp>

namespace raft::bench::distance {

struct fusedl2nn_inputs {
int64_t m, n, k;
}; // struct fusedl2nn_inputs

inline auto operator<<(std::ostream& os, const fusedl2nn_inputs& p) -> std::ostream&
{
os << p.m << "#" << p.n << "#" << p.k;
return os;
}

template <typename DataT, typename IdxT, typename OutT>
struct fusedl2nn : public fixture {
fusedl2nn(const fusedl2nn_inputs& p) : params(p) {}

void allocate_data(const ::benchmark::State& state) override
{
x = raft::make_device_matrix<DataT, IdxT>(handle, params.m, params.k);
y = raft::make_device_matrix<DataT, IdxT>(handle, params.n, params.k);
x_norm = raft::make_device_vector<DataT, IdxT>(handle, params.m);
y_norm = raft::make_device_vector<DataT, IdxT>(handle, params.n);
out = raft::make_device_vector<OutT, IdxT>(handle, params.m);

raft::random::RngState rng{1234};
raft::random::uniform(
handle, rng, x.data_handle(), params.m * params.k, (DataT)-1.0, (DataT)1.0);
raft::random::uniform(
handle, rng, y.data_handle(), params.n * params.k, (DataT)-1.0, (DataT)1.0);

// Pre-compute norms
raft::linalg::rowNorm(x_norm.data_handle(),
x.data_handle(),
params.k,
params.m,
raft::linalg::L2Norm,
true,
stream);
raft::linalg::rowNorm(y_norm.data_handle(),
y.data_handle(),
params.k,
params.n,
raft::linalg::L2Norm,
true,
stream);
handle.sync_stream(stream);
}

void allocate_temp_buffers(const ::benchmark::State& state) override
{
workspace = raft::make_device_vector<char, IdxT>(handle, params.m * sizeof(IdxT));
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
raft::distance::fusedL2NNMinReduce<DataT, OutT, IdxT>(out.data_handle(),
x.data_handle(),
y.data_handle(),
x_norm.data_handle(),
y_norm.data_handle(),
static_cast<IdxT>(params.m),
static_cast<IdxT>(params.n),
static_cast<IdxT>(params.k),
(void*)workspace.data_handle(),
false,
true,
stream);
});

int64_t num_flops = 2 * params.m * params.n * params.k;

int64_t read_elts = params.n * params.k + params.m * params.k;
int64_t write_elts = params.m;

state.counters["FLOP/s"] = benchmark::Counter(
num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000);

state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT),
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1000);
state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT),
benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1000);
}

private:
fusedl2nn_inputs params;
raft::device_matrix<DataT, IdxT> x, y;
raft::device_vector<DataT, IdxT> x_norm, y_norm;
raft::device_vector<OutT, IdxT> out;
raft::device_vector<char, IdxT> workspace;
}; // struct fusedl2nn

template <typename IdxT>
std::vector<fusedl2nn_inputs> getFusedL2NNInputs()
{
std::vector<fusedl2nn_inputs> inputs;
std::vector<int64_t> m_list = {100000, 1000000};
if constexpr (sizeof(IdxT) == 8) { m_list.push_back(10000000); }
std::vector<int64_t> n_list = {100, 1000, 10000};
std::vector<int64_t> k_list = {64, 128, 256};
for (auto m : m_list) {
for (auto n : n_list) {
for (auto k : k_list) {
inputs.push_back({m, n, k});
}
}
}
return inputs;
}

#define FUSEDL2NN_BENCH(DataT, IdxT, OutT) \
RAFT_BENCH_REGISTER((fusedl2nn<DataT, IdxT, RAFT_DEPAREN(OutT)>), "", getFusedL2NNInputs<IdxT>())

FUSEDL2NN_BENCH(float, int, float);
FUSEDL2NN_BENCH(double, int, double);
FUSEDL2NN_BENCH(float, int, (raft::KeyValuePair<int, float>));
FUSEDL2NN_BENCH(double, int, (raft::KeyValuePair<int, double>));
FUSEDL2NN_BENCH(float, int64_t, float);
FUSEDL2NN_BENCH(double, int64_t, double);
FUSEDL2NN_BENCH(float, int64_t, (raft::KeyValuePair<int64_t, float>));
FUSEDL2NN_BENCH(double, int64_t, (raft::KeyValuePair<int64_t, double>));

} // namespace raft::bench::distance
123 changes: 0 additions & 123 deletions cpp/bench/neighbors/fused_l2_nn.cu

This file was deleted.

2 changes: 0 additions & 2 deletions cpp/include/raft/linalg/contractions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,12 @@ struct Policy2x8 {
template <int _veclen>
struct Policy2x8<float, _veclen> {
typedef KernelPolicy<float, _veclen, 16, 2, 8, 8, 32> Policy;
typedef ColKernelPolicy<float, _veclen, 16, 2, 8, 8, 32> ColPolicy;
};

template <int _veclen>
struct Policy2x8<double, _veclen> {
// this is not used just for keeping compiler happy.
typedef KernelPolicy<double, _veclen, 32, 1, 2, 8, 32> Policy;
typedef ColKernelPolicy<double, _veclen, 32, 1, 2, 8, 32> ColPolicy;
};
/** @} */

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/random/detail/rng_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ __global__ void rngKernel(DeviceState<GenType> rng_state,
LenType len,
ParamType params)
{
LenType tid = threadIdx.x + blockIdx.x * blockDim.x;
LenType tid = threadIdx.x + static_cast<LenType>(blockIdx.x) * blockDim.x;
GenType gen(rng_state, (uint64_t)tid);
const LenType stride = gridDim.x * blockDim.x;
for (LenType idx = tid; idx < len; idx += stride * ITEMS_PER_CALL) {
Expand All @@ -692,7 +692,7 @@ template <typename OutType,
__global__ void fillKernel(
uint64_t seed, uint64_t adv_subs, uint64_t offset, OutType* ptr, LenType len, ParamType params)
{
LenType tid = threadIdx.x + blockIdx.x * blockDim.x;
LenType tid = threadIdx.x + static_cast<LenType>(blockIdx.x) * blockDim.x;
GenType gen(seed, adv_subs + (uint64_t)tid, offset);
const LenType stride = gridDim.x * blockDim.x;
for (LenType idx = tid; idx < len; idx += stride * ITEMS_PER_CALL) {
Expand Down

0 comments on commit 93a20b8

Please sign in to comment.