Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEA] [REVIEW] RMAT rectangular graph generator #738

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fixed the hang in shfl-logic due to divergence
  • Loading branch information
teju85 committed Jul 8, 2022
commit 117c5022278e7e12b4794ace19e2936c683713c0
72 changes: 32 additions & 40 deletions cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,34 @@ namespace random {
namespace detail {

template <typename IdxT, typename ProbT>
DI void gen_bits(IdxT& src_bit, IdxT& dst_bit, ProbT a, ProbT ab, ProbT abc, bool clip_and_flip,
raft::random::PCGenerator& gen)
DI void gen_and_update_bits(
IdxT& src_id, IdxT& dst_id, ProbT a, ProbT ab, ProbT abc, bool clip_and_flip, IdxT r_scale,
IdxT c_scale, IdxT curr_depth, raft::random::PCGenerator& gen)
{
src_bit = dst_bit = 0;
bool src_bit, dst_bit;
ProbT val;
gen.next(val);
if (val <= a) {
src_bit = dst_bit = 0;
src_bit = dst_bit = false;
} else if (val <= ab) {
src_bit = 0;
dst_bit = 1;
src_bit = false;
dst_bit = true;
} else if (val <= abc) {
src_bit = 1;
dst_bit = 0;
src_bit = true;
dst_bit = false;
} else {
src_bit = dst_bit = 1;
src_bit = dst_bit = false;
}
// Courtesy: clip-and-flip from the existing RMAT generator in cuGraph
if (clip_and_flip) {
if (src_id == dst_id) {
if (!src_bit && dst_bit) {
src_bit = !src_bit;
dst_bit = !dst_bit;
}
}
if (clip_and_flip && (src_id == dst_id) && (!src_bit && dst_bit)) {
src_bit = !src_bit;
dst_bit = !dst_bit;
}
if (curr_depth < r_scale) {
src_id += (src_bit << (r_scale - curr_depth));
}
if (curr_depth < c_scale) {
dst_id += (dst_bit << (c_scale - curr_depth));
}
}

Expand Down Expand Up @@ -80,36 +83,32 @@ __global__ void rmat_gen_kernel(
{
IdxT idx = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
extern __shared__ ProbT s_theta[];
auto lid = raft::laneId();
unsigned mask = 0xfu << (lid / 4);
auto lid4 = raft::laneId() % 4;
auto theta_len = max_scale * 2 * 2;
auto num_theta_aligned = raft::alignTo<IdxT>(theta_len, raft::WarpSize);
// NOTE: assumes that blockDim.x is a multiple of 4!
for (int i = threadIdx.x; i < max_scale * 2 * 2; i += blockDim.x) {
for (int i = threadIdx.x; i < num_theta_aligned; i += blockDim.x) {
// for each consecutive 4 lanes compute the cdf of a, b, c, d (RMAT numbers)
// this will be used to determine which quadrant to be selected at each level
auto r_theta = theta[i];
auto r_theta = i < theta_len ? theta[i] : ProbT(0);
auto other = raft::shfl_up(r_theta, 0x1);
if (lid % 4 >= 1) {
if (lid4 >= 1) {
r_theta += other;
}
other = raft::shfl_up(r_theta, 0x2);
if (lid % 4 >= 2) {
if (lid4 >= 2) {
r_theta += other;
}
s_theta[i] = r_theta;
if (i < theta_len) {
s_theta[i] = r_theta;
}
}
__syncthreads();
IdxT src_id{0}, dst_id{0};
raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
for (IdxT i = 0; i < max_scale; ++i) {
auto a = s_theta[i * 4], ab = s_theta[i * 4 + 1], abc = s_theta[i * 4 + 2];
IdxT src_bit, dst_bit;
gen_bits(src_bit, dst_bit, a, ab, abc, clip_and_flip, gen);
if (i < r_scale) {
src_id += (src_bit << (r_scale - i));
}
if (i < c_scale) {
dst_id += (dst_bit << (c_scale - i));
}
gen_and_update_bits(src_id, dst_id, a, ab, abc, clip_and_flip, r_scale, c_scale, i, gen);
}
store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
}
Expand Down Expand Up @@ -146,14 +145,7 @@ __global__ void rmat_gen_kernel(
IdxT src_id{0}, dst_id{0};
raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
for (IdxT i = 0; i < max_scale; ++i) {
IdxT src_bit, dst_bit;
gen_bits(src_bit, dst_bit, a, ab, abc, clip_and_flip, gen);
if (i < r_scale) {
src_id += (src_bit << (r_scale - i));
}
if (i < c_scale) {
dst_id += (dst_bit << (c_scale - i));
}
gen_and_update_bits(src_id, dst_id, a, ab, abc, clip_and_flip, r_scale, c_scale, i, gen);
}
store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
}
Expand All @@ -177,7 +169,7 @@ void rmat_rectangular_gen_caller(IdxT* out,
auto max_scale = max(r_scale, c_scale);
auto n_blks = raft::ceildiv<IdxT>(n_edges, N_THREADS);
auto ab = a + b, abc = ab + c;
rmat_gen_kernel<<<n_blks, N_THREADS, smem_size, stream>>>(
rmat_gen_kernel<<<n_blks, N_THREADS, 0, stream>>>(
out, out_src, out_dst, a, ab, abc, r_scale, c_scale, n_edges, clip_and_flip, max_scale, r);
RAFT_CUDA_TRY(cudaGetLastError());
r.advance(n_edges, max_scale);
Expand Down
2 changes: 2 additions & 0 deletions cpp/include/raft/random/rmat_rectangular_generator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ void rmat_rectangular_gen(IdxT* out,
r_scale,
c_scale,
n_edges,
clip_and_flip,
stream,
r);
}
Expand Down Expand Up @@ -111,6 +112,7 @@ void rmat_rectangular_gen(IdxT* out,
r_scale,
c_scale,
n_edges,
clip_and_flip,
stream,
r);
}
Expand Down
32 changes: 19 additions & 13 deletions cpp/test/random/rmat_rectangular_generator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
* limitations under the License.
*/

#include <cub/cub.cuh>
#include <gtest/gtest.h>
#include <sys/timeb.h>
#include <vector>

#include "../test_utils.h"
#include <cub/cub.cuh>
#include <gtest/gtest.h>

#include <raft/cuda_utils.cuh>
#include <raft/cudart_utils.h>
#include <raft/random/rmat_rectangular_generator.cuh>
Expand All @@ -28,8 +30,8 @@ namespace raft {
namespace random {

// Courtesy: cuGraph unit-tests
static constexpr float kTolerance = 0.01f;
static constexpr size_t kMinEdges = 100000;
// static constexpr float kTolerance = 0.01f;
// static constexpr size_t kMinEdges = 100000;

struct RmatInputs {
size_t r_scale;
Expand Down Expand Up @@ -58,12 +60,15 @@ __global__ void normalize_kernel(float* theta, size_t len) {
class RmatGenTest : public ::testing::TestWithParam<RmatInputs> {
public:
RmatGenTest()
: params(::testing::TestWithParam<RmatInputs>::GetParam()),
stream(handle.get_stream()),
out(params.n_edges * 2, stream),
out_src(params.n_edges, stream),
out_dst(params.n_edges, stream),
state(seed, GeneratorType::GenPC)
: handle{},
stream{handle.get_stream()},
params{::testing::TestWithParam<RmatInputs>::GetParam()},
out{params.n_edges * 2, stream},
out_src{params.n_edges, stream},
out_dst{params.n_edges, stream},
theta{0, stream},
h_theta{},
state{params.seed, GeneratorType::GenPC}
{
auto theta_len = params.theta_array ? max(params.r_scale, params.c_scale) : 1;
theta.resize(4 * theta_len, stream);
Expand All @@ -72,14 +77,15 @@ class RmatGenTest : public ::testing::TestWithParam<RmatInputs> {
// won't be that large!
normalize_kernel<<<1, 256, 0, stream>>>(theta.data(), theta_len);
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
raft::update_host(h_theta, theta.data(), 4, stream);
h_theta.resize(theta.size());
raft::update_host(h_theta, theta.data(), theta.size(), stream);
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
}

protected:
void SetUp() override
{
if (theta_array) {
if (params.theta_array) {
rmat_rectangular_gen(out.data(), out_src.data(), out_dst.data(), theta.data(), params.r_scale,
params.c_scale, params.n_edges, params.clip_and_flip, stream, state);
} else {
Expand All @@ -102,7 +108,7 @@ class RmatGenTest : public ::testing::TestWithParam<RmatInputs> {
RmatInputs params;
rmm::device_uvector<size_t> out, out_src, out_dst;
rmm::device_uvector<float> theta;
float h_theta[4];
std::vector<float> h_theta;
RngState state;
};

Expand Down