Skip to content

Commit d576348

Browse files
authored
Updates to enable HDBSCAN (#208)
Authors: - Corey J. Nolet (https://github.com/cjnolet) - Divye Gala (https://github.com/divyegala) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: #208
1 parent c00d3b2 commit d576348

File tree

7 files changed

+86
-141
lines changed

7 files changed

+86
-141
lines changed

cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh

Lines changed: 16 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,15 @@ class UnionFind {
7373
};
7474

7575
/**
76-
* Standard single-threaded agglomerative labeling on host. This should work
77-
* well for smaller sizes of m. This is a C++ port of the original reference
78-
* implementation of HDBSCAN.
76+
* Agglomerative labeling on host. This has not been found to be a bottleneck
77+
* in the algorithm. A parallel version of this can be done using a parallel
78+
* variant of Kruskal's MST algorithm
79+
* (ref http://cucis.ece.northwestern.edu/publications/pdf/HenPat12.pdf),
80+
* which breaks apart the sorted MST results into overlapping subsets and
81+
* independently runs Kruskal's algorithm on each subset, merging them back
82+
* together into a single hierarchy when complete. Unfortunately,
83+
* this is nontrivial and the speedup wouldn't be useful until this
84+
* becomes a bottleneck.
7985
*
8086
* @tparam value_idx
8187
* @tparam value_t
@@ -91,9 +97,8 @@ class UnionFind {
9197
template <typename value_idx, typename value_t>
9298
void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
9399
const value_idx *cols, const value_t *data,
94-
size_t nnz, value_idx *children,
95-
rmm::device_uvector<value_t> &out_delta,
96-
rmm::device_uvector<value_idx> &out_size) {
100+
size_t nnz, value_idx *children, value_t *out_delta,
101+
value_idx *out_size) {
97102
auto d_alloc = handle.get_device_allocator();
98103
auto stream = handle.get_stream();
99104

@@ -103,8 +108,6 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
103108
std::vector<value_idx> mst_dst_h(n_edges);
104109
std::vector<value_t> mst_weights_h(n_edges);
105110

106-
printf("n_edges: %d\n", n_edges);
107-
108111
update_host(mst_src_h.data(), rows, n_edges, stream);
109112
update_host(mst_dst_h.data(), cols, n_edges, stream);
110113
update_host(mst_weights_h.data(), data, n_edges, stream);
@@ -113,12 +116,14 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
113116

114117
std::vector<value_idx> children_h(n_edges * 2);
115118
std::vector<value_idx> out_size_h(n_edges);
119+
std::vector<value_t> out_delta_h(n_edges);
116120

117121
UnionFind<value_idx, value_t> U(nnz + 1);
118122

119123
for (value_idx i = 0; i < nnz; i++) {
120124
value_idx a = mst_src_h[i];
121125
value_idx b = mst_dst_h[i];
126+
value_t delta = mst_weights_h[i];
122127

123128
value_idx aa = U.find(a);
124129
value_idx bb = U.find(b);
@@ -127,72 +132,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
127132

128133
children_h[children_idx] = aa;
129134
children_h[children_idx + 1] = bb;
135+
out_delta_h[i] = delta;
130136
out_size_h[i] = U.size[aa] + U.size[bb];
131137

132138
U.perform_union(aa, bb);
133139
}
134140

135-
out_size.resize(n_edges, stream);
136-
137-
printf("Finished dendrogram\n");
138-
139141
raft::update_device(children, children_h.data(), n_edges * 2, stream);
140-
raft::update_device(out_size.data(), out_size_h.data(), n_edges, stream);
141-
}
142-
143-
/**
144-
* Parallel agglomerative labeling. This amounts to a parallel Kruskal's
145-
* MST algorithm, which breaks apart the sorted MST results into overlapping
146-
* subsets and independently runs Kruskal's algorithm on each subset,
147-
* merging them back together into a single hierarchy when complete.
148-
*
149-
* This outputs the same format as the reference HDBSCAN, but as 4 separate
150-
* arrays, rather than a single 2D array.
151-
*
152-
* Reference: http://cucis.ece.northwestern.edu/publications/pdf/HenPat12.pdf
153-
*
154-
* TODO: Investigate potential for the following end-to-end single-hierarchy batching:
155-
* For each of k (independent) batches over the input:
156-
* - Sample n elements from X
157-
* - Compute mutual reachability graph of batch
158-
* - Construct labels from batch
159-
*
160-
* The sampled datasets should have some overlap across batches. This will
161-
* allow for the cluster hierarchies to be merged. Being able to batch
162-
* will reduce the memory cost so that the full n^2 pairwise distances
163-
* don't need to be materialized in memory all at once.
164-
*
165-
* @tparam value_idx
166-
* @tparam value_t
167-
* @param[in] handle the raft handle
168-
* @param[in] rows src edges of the sorted MST
169-
* @param[in] cols dst edges of the sorted MST
170-
* @param[in] nnz the number of edges in the sorted MST
171-
* @param[out] out_src parents of output
172-
* @param[out] out_dst children of output
173-
* @param[out] out_delta distances of output
174-
* @param[out] out_size cluster sizes of output
175-
* @param[in] k_folds number of folds for parallelizing label step
176-
*/
177-
template <typename value_idx, typename value_t>
178-
void build_dendrogram_device(const handle_t &handle, const value_idx *rows,
179-
const value_idx *cols, const value_t *data,
180-
value_idx nnz, value_idx *children,
181-
value_t *out_delta, value_idx *out_size,
182-
value_idx k_folds) {
183-
ASSERT(k_folds < nnz / 2, "k_folds must be < n_edges / 2");
184-
/**
185-
* divide (sorted) mst coo into overlapping subsets. Easiest way to do this is to
186-
* break it into k-folds and iterate through two folds at a time.
187-
*/
188-
189-
// 1. Generate ranges for the overlapping subsets
190-
191-
// 2. Run union-find in parallel for each pair of folds
192-
193-
// 3. Sort individual label hierarchies
194-
195-
// 4. Merge label hierarchies together
142+
raft::update_device(out_size, out_size_h.data(), n_edges, stream);
143+
raft::update_device(out_delta, out_delta_h.data(), n_edges, stream);
196144
}
197145

198146
template <typename value_idx>

cpp/include/raft/sparse/hierarchy/detail/mst.cuh

Lines changed: 23 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <raft/cudart_utils.h>
2020
#include <raft/cuda_utils.cuh>
2121

22+
#include <raft/sparse/op/sort.h>
2223
#include <raft/mr/device/buffer.hpp>
2324
#include <raft/sparse/mst/mst.cuh>
2425
#include <raft/sparse/selection/connect_components.cuh>
@@ -35,29 +36,6 @@ namespace raft {
3536
namespace hierarchy {
3637
namespace detail {
3738

38-
/**
39-
* Sorts a COO by its weight
40-
* @tparam value_idx
41-
* @tparam value_t
42-
* @param[inout] rows source edges
43-
* @param[inout] cols dest edges
44-
* @param[inout] data edge weights
45-
* @param[in] nnz number of edges in edge list
46-
* @param[in] stream cuda stream for which to order cuda operations
47-
*/
48-
template <typename value_idx, typename value_t>
49-
void sort_coo_by_data(value_idx *rows, value_idx *cols, value_t *data,
50-
value_idx nnz, cudaStream_t stream) {
51-
thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
52-
thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
53-
thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
54-
55-
auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
56-
57-
thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
58-
first);
59-
}
60-
6139
template <typename value_idx, typename value_t>
6240
void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
6341
raft::Graph_COO<value_idx, value_idx, value_t> &coo2,
@@ -95,19 +73,20 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
9573
* @param[inout] color the color labels array returned from the mst invocation
9674
* @return updated MST edge list
9775
*/
98-
template <typename value_idx, typename value_t>
76+
template <typename value_idx, typename value_t, typename red_op>
9977
void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
10078
raft::Graph_COO<value_idx, value_idx, value_t> &msf,
10179
size_t m, size_t n, value_idx *color,
80+
red_op reduction_op,
10281
raft::distance::DistanceType metric =
10382
raft::distance::DistanceType::L2SqrtExpanded) {
10483
auto d_alloc = handle.get_device_allocator();
10584
auto stream = handle.get_stream();
10685

10786
raft::sparse::COO<value_t, value_idx> connected_edges(d_alloc, stream);
10887

109-
raft::linkage::connect_components<value_idx, value_t>(handle, connected_edges,
110-
X, color, m, n);
88+
raft::linkage::connect_components<value_idx, value_t>(
89+
handle, connected_edges, X, color, m, n, reduction_op);
11190

11291
rmm::device_uvector<value_idx> indptr2(m + 1, stream);
11392
raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(),
@@ -147,38 +126,34 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
147126
* @param[in] max_iter maximum iterations to run knn graph connection. This
148127
* argument is really just a safeguard against the potential for infinite loops.
149128
*/
150-
template <typename value_idx, typename value_t>
129+
template <typename value_idx, typename value_t, typename red_op>
151130
void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
152131
const value_idx *indptr, const value_idx *indices,
153132
const value_t *pw_dists, size_t m, size_t n,
154-
rmm::device_uvector<value_idx> &mst_src,
155-
rmm::device_uvector<value_idx> &mst_dst,
156-
rmm::device_uvector<value_t> &mst_weight,
157-
const size_t nnz,
133+
value_idx *mst_src, value_idx *mst_dst,
134+
value_t *mst_weight, value_idx *color, size_t nnz,
135+
red_op reduction_op,
158136
raft::distance::DistanceType metric =
159137
raft::distance::DistanceType::L2SqrtExpanded,
160138
int max_iter = 10) {
161139
auto d_alloc = handle.get_device_allocator();
162140
auto stream = handle.get_stream();
163141

164-
rmm::device_uvector<value_idx> color(m, stream);
165-
166142
// We want to have MST initialize colors on first call.
167143
auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
168-
handle, indptr, indices, pw_dists, (value_idx)m, nnz, color.data(), stream,
169-
false, true);
144+
handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false,
145+
true);
170146

171147
int iters = 1;
172-
int n_components =
173-
linkage::get_n_components(color.data(), m, d_alloc, stream);
148+
int n_components = linkage::get_n_components(color, m, d_alloc, stream);
174149

175150
while (n_components > 1 && iters < max_iter) {
176-
connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n,
177-
color.data());
151+
connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color,
152+
reduction_op);
178153

179154
iters++;
180155

181-
n_components = linkage::get_n_components(color.data(), m, d_alloc, stream);
156+
n_components = linkage::get_n_components(color, m, d_alloc, stream);
182157
}
183158

184159
/**
@@ -189,7 +164,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
189164
* 1. There is a bug in this code somewhere
190165
* 2. Either the given KNN graph wasn't generated from X or the same metric is not being used
191166
* to generate the 1-nn (currently only L2SqrtExpanded is supported).
192-
* 3. max_iter was not large enough to connect the graph.
167+
* 3. max_iter was not large enough to connect the graph (less likely).
193168
*
194169
* Note that a KNN graph generated from 50 random isotropic balls (with significant overlap)
195170
* was able to be connected in a single iteration.
@@ -201,20 +176,15 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
201176
" or increase 'max_iter'",
202177
max_iter);
203178

204-
sort_coo_by_data(mst_coo.src.data(), mst_coo.dst.data(),
205-
mst_coo.weights.data(), mst_coo.n_edges, stream);
206-
207-
// TODO: be nice if we could pass these directly into the MST
208-
mst_src.resize(mst_coo.n_edges, stream);
209-
mst_dst.resize(mst_coo.n_edges, stream);
210-
mst_weight.resize(mst_coo.n_edges, stream);
179+
raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(),
180+
mst_coo.weights.data(), mst_coo.n_edges,
181+
stream);
211182

212-
raft::copy_async(mst_src.data(), mst_coo.src.data(), mst_coo.n_edges, stream);
213-
raft::copy_async(mst_dst.data(), mst_coo.dst.data(), mst_coo.n_edges, stream);
214-
raft::copy_async(mst_weight.data(), mst_coo.weights.data(), mst_coo.n_edges,
215-
stream);
183+
raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream);
184+
raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream);
185+
raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
216186
}
217187

218188
}; // namespace detail
219189
}; // namespace hierarchy
220-
}; // namespace raft
190+
}; // namespace raft

cpp/include/raft/sparse/hierarchy/single_linkage.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
7070
detail::get_distance_graph<value_idx, value_t, dist_type>(
7171
handle, X, m, n, metric, indptr, indices, pw_dists, c);
7272

73-
rmm::device_uvector<value_idx> mst_rows(EMPTY, stream);
74-
rmm::device_uvector<value_idx> mst_cols(EMPTY, stream);
75-
rmm::device_uvector<value_t> mst_data(EMPTY, stream);
73+
rmm::device_uvector<value_idx> mst_rows(m - 1, stream);
74+
rmm::device_uvector<value_idx> mst_cols(m - 1, stream);
75+
rmm::device_uvector<value_t> mst_data(m - 1, stream);
7676

7777
/**
7878
* 2. Construct MST, sorted by weights
7979
*/
80+
rmm::device_uvector<value_idx> color(m, stream);
81+
raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
8082
detail::build_sorted_mst<value_idx, value_t>(
81-
handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, mst_rows,
82-
mst_cols, mst_data, indices.size(), metric);
83+
handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n,
84+
mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(),
85+
indices.size(), op, metric);
8386

8487
pw_dists.release();
8588

@@ -93,7 +96,7 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
9396
// Create dendrogram
9497
detail::build_dendrogram_host<value_idx, value_t>(
9598
handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges,
96-
out->children, out_delta, out_size);
99+
out->children, out_delta.data(), out_size.data());
97100
detail::extract_flattened_clusters(handle, out->labels, out->children,
98101
n_clusters, m);
99102

cpp/include/raft/sparse/op/reduce.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ void max_duplicates(const raft::handle_t &handle,
136136

137137
compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);
138138

139-
thrust::exclusive_scan(exec_policy, diff.data(), diff.data() + diff.size(),
140-
diff.data());
139+
thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(),
140+
diff.data() + diff.size(), diff.data());
141141

142142
// compute final size
143143
value_idx size = 0;

cpp/include/raft/sparse/op/sort.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,29 @@ void coo_sort(COO<T> *const in,
9292
coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
9393
in->vals(), d_alloc, stream);
9494
}
95+
96+
/**
97+
* Sorts a COO by its weight
98+
* @tparam value_idx
99+
* @tparam value_t
100+
* @param[inout] rows source edges
101+
* @param[inout] cols dest edges
102+
* @param[inout] data edge weights
103+
* @param[in] nnz number of edges in edge list
104+
* @param[in] stream cuda stream for which to order cuda operations
105+
*/
106+
template <typename value_idx, typename value_t>
107+
void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data,
108+
value_idx nnz, cudaStream_t stream) {
109+
thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
110+
thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
111+
thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
112+
113+
auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
114+
115+
thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
116+
first);
117+
}
95118
}; // namespace op
96119
}; // end NAMESPACE sparse
97120
}; // end NAMESPACE raft

0 commit comments

Comments
 (0)