Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add random subsampling for IVF methods #2077

Merged
merged 11 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
remove random_seed parameter
  • Loading branch information
tfeher committed Jan 21, 2024
commit 2ee6aff7987b9411d1ccfddc314bccba2d3e2245
4 changes: 1 addition & 3 deletions cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -55,7 +55,6 @@ void parse_build_param(const nlohmann::json& conf,
param.n_lists = conf.at("nlist");
if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
if (conf.contains("random_seed")) { param.random_seed = conf.at("random_seed"); }
}

template <typename T, typename IdxT>
Expand Down Expand Up @@ -88,7 +87,6 @@ void parse_build_param(const nlohmann::json& conf,
"', should be either 'cluster' or 'subspace'");
}
}
if (conf.contains("random_seed")) { param.random_seed = conf.at("random_seed"); }
}

template <typename T, typename IdxT>
Expand Down
3 changes: 2 additions & 1 deletion cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,12 +361,13 @@ inline auto build(raft::resources const& handle,

// Train the kmeans clustering
{
int random_seed = 137;
auto trainset_ratio = std::max<size_t>(
1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
auto n_rows_train = n_rows / trainset_ratio;
auto trainset = make_device_matrix<T, IdxT>(handle, n_rows_train, index.dim());
raft::spatial::knn::detail::utils::subsample(
handle, dataset, n_rows, trainset.view(), params.random_seed);
handle, dataset, n_rows, trainset.view(), random_seed);
auto centers_view = raft::make_device_matrix_view<float, IdxT>(
index.centers().data_handle(), index.n_lists(), index.dim());
raft::cluster::kmeans_balanced_params kmeans_params;
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1755,6 +1755,7 @@ auto build(raft::resources const& handle,
utils::memzero(index.inds_ptrs().data_handle(), index.inds_ptrs().size(), stream);

{
int random_seed = 137;
auto trainset_ratio = std::max<size_t>(
1,
size_t(n_rows) / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
Expand All @@ -1770,14 +1771,14 @@ auto build(raft::resources const& handle,

if constexpr (std::is_same_v<T, float>) {
raft::spatial::knn::detail::utils::subsample(
handle, dataset, n_rows, trainset.view(), params.random_seed);
handle, dataset, n_rows, trainset.view(), random_seed);
} else {
// TODO(tfeher): Enable codebook generation with any type T, and then remove
// trainset tmp.
auto trainset_tmp =
make_device_mdarray<T>(handle, device_mr, make_extents<IdxT>(n_rows_train, dim));
raft::spatial::knn::detail::utils::subsample(
handle, dataset, n_rows, trainset_tmp.view(), params.random_seed);
handle, dataset, n_rows, trainset_tmp.view(), random_seed);
cudaDeviceSynchronize();
RAFT_LOG_INFO("Subsampling done, converting to float");
raft::linalg::unaryOp(trainset.data_handle(),
Expand Down
8 changes: 1 addition & 7 deletions cpp/include/raft/neighbors/ivf_flat_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -76,12 +76,6 @@ struct index_params : ann::index_params {
* flag to `true` if you prefer to use as little GPU memory for the database as possible.
*/
bool conservative_memory_allocation = false;
/**
* Seed used for random sampling if kmeans_trainset_fraction < 1.
*
* Value -1 disables random sampling, and results in sampling with a fixed stride.
*/
int random_seed = 0;
};

struct search_params : ann::search_params {
Expand Down
9 changes: 1 addition & 8 deletions cpp/include/raft/neighbors/ivf_pq_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -105,13 +105,6 @@ struct index_params : ann::index_params {
* flag to `true` if you prefer to use as little GPU memory for the database as possible.
*/
bool conservative_memory_allocation = false;

/**
* Seed used for random sampling if kmeans_trainset_fraction < 1.
*
* Value -1 disables random sampling, and results in sampling with a fixed stride.
*/
int random_seed = 0;
};

struct search_params : ann::search_params {
Expand Down
10 changes: 2 additions & 8 deletions cpp/test/neighbors/ann_ivf_flat.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,15 +73,14 @@ struct AnnIvfFlatInputs {
raft::distance::DistanceType metric;
bool adaptive_centers;
bool host_dataset;
int seed;
};

template <typename IdxT>
::std::ostream& operator<<(::std::ostream& os, const AnnIvfFlatInputs<IdxT>& p)
{
os << "{ " << p.num_queries << ", " << p.num_db_vecs << ", " << p.dim << ", " << p.k << ", "
<< p.nprobe << ", " << p.nlist << ", " << static_cast<int>(p.metric) << ", "
<< p.adaptive_centers << ", " << p.host_dataset << "," << p.seed << '}' << std::endl;
<< p.adaptive_centers << ", " << p.host_dataset << '}' << std::endl;
return os;
}

Expand Down Expand Up @@ -179,7 +178,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
index_params.add_data_on_build = false;
index_params.kmeans_trainset_fraction = 0.5;
index_params.metric_arg = 0;
index_params.random_seed = ps.seed;

ivf_flat::index<DataT, IdxT> idx(handle_, index_params, ps.dim);
ivf_flat::index<DataT, IdxT> index_2(handle_, index_params, ps.dim);
Expand Down Expand Up @@ -329,7 +327,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
index_params.add_data_on_build = false;
index_params.kmeans_trainset_fraction = 1.0;
index_params.metric_arg = 0;
index_params.random_seed = ps.seed;

auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
(const DataT*)database.data(), ps.num_db_vecs, ps.dim);
Expand Down Expand Up @@ -500,7 +497,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
index_params.add_data_on_build = true;
index_params.kmeans_trainset_fraction = 0.5;
index_params.metric_arg = 0;
index_params.random_seed = ps.seed;

// Create IVF Flat index
auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
Expand Down Expand Up @@ -611,8 +607,6 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
{20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
{1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
{10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false},
{10000, 1000000, 96, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false, true, -1},
{10000, 1000000, 96, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false, false, -1},

// host input data
{1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false, true},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -56,7 +56,6 @@ cdef extern from "raft/neighbors/ivf_flat_types.hpp" \
double kmeans_trainset_fraction
bool adaptive_centers
bool conservative_memory_allocation
int random_seed

cdef cppclass index[T, IdxT](ann_index):
index(const device_resources& handle,
Expand Down
15 changes: 2 additions & 13 deletions python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -113,11 +113,6 @@ cdef class IndexParams:
adding new data (through the classification of the added data);
that is, `index.centers()` "drift" together with the changing
distribution of the newly added data.
random_seed : int, default = 0
Seed used for random sampling if kmeans_trainset_fraction < 1.
Value -1 disables random sampling, and results in sampling with a
fixed stride.

"""
cdef c_ivf_flat.index_params params

Expand All @@ -127,16 +122,14 @@ cdef class IndexParams:
kmeans_n_iters=20,
kmeans_trainset_fraction=0.5,
add_data_on_build=True,
bool adaptive_centers=False,
random_seed=0):
bool adaptive_centers=False):
self.params.n_lists = n_lists
self.params.metric = _get_metric(metric)
self.params.metric_arg = 0
self.params.kmeans_n_iters = kmeans_n_iters
self.params.kmeans_trainset_fraction = kmeans_trainset_fraction
self.params.add_data_on_build = add_data_on_build
self.params.adaptive_centers = adaptive_centers
self.params.random_seed = random_seed

@property
def n_lists(self):
Expand All @@ -162,10 +155,6 @@ cdef class IndexParams:
def adaptive_centers(self):
return self.params.adaptive_centers

@property
def random_seed(self):
return self.params.random_seed


cdef class Index:
cdef readonly bool trained
Expand Down
3 changes: 1 addition & 2 deletions python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -78,7 +78,6 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \
codebook_gen codebook_kind
bool force_random_rotation
bool conservative_memory_allocation
int random_seed

cdef cppclass index[IdxT](ann_index):
index(const device_resources& handle,
Expand Down
14 changes: 2 additions & 12 deletions python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -156,10 +156,6 @@ cdef class IndexParams:
repeated calls to `extend` (extending the database).
To disable this behavior and use as little GPU memory for the
database as possible, set this flag to `True`.
random_seed : int, default = 0
Seed used for random sampling if kmeans_trainset_fraction < 1.
Value -1 disables random sampling, and results in sampling with a
fixed stride.
"""
def __init__(self, *,
n_lists=1024,
Expand All @@ -171,8 +167,7 @@ cdef class IndexParams:
codebook_kind="subspace",
force_random_rotation=False,
add_data_on_build=True,
conservative_memory_allocation=False,
random_seed=0):
conservative_memory_allocation=False):
self.params.n_lists = n_lists
self.params.metric = _get_metric(metric)
self.params.metric_arg = 0
Expand All @@ -190,7 +185,6 @@ cdef class IndexParams:
self.params.add_data_on_build = add_data_on_build
self.params.conservative_memory_allocation = \
conservative_memory_allocation
self.params.random_seed = random_seed

@property
def n_lists(self):
Expand Down Expand Up @@ -232,10 +226,6 @@ cdef class IndexParams:
def conservative_memory_allocation(self):
return self.params.conservative_memory_allocation

@property
def random_seed(self):
return self.params.random_seed


cdef class Index:
# We store a pointer to the index because it does not have a trivial
Expand Down