Closed
Description
opened on Feb 13, 2023
When linking with libomp (or libiomp5), we are seeing a segfault coming from shared_ptr
usage in raft::interruptible. Following is a reproducer for this bug:
- Using ngc pytorch 23.01 container:
- Install gtest and libomp:
apt update && apt install libgtest-dev libomp-dev
- Compile the following code with:
nvcc -Xcompiler=-fopenmp --std=c++17 test.cu -o test -lgtest -lgtest_main -lomp5
and run ./test
#include <cstddef>
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <omp.h>
#include <raft/interruptible.hpp>
#include <rmm/cuda_stream.hpp>
#include <thread>
#include <vector>
namespace raft {
/**
 * Kernel that keeps the device busy for `millis` outer iterations.
 *
 * On devices with compute capability 7.0 or newer each iteration calls
 * `__nanosleep(1000000)` (one million nanoseconds, i.e. roughly one
 * millisecond). On older devices each iteration runs a fixed amount of
 * throw-away arithmetic instead.
 *
 * @param millis number of outer iterations to execute; values <= 0 make the
 *               kernel return immediately.
 */
__global__ void gpu_wait(int millis)
{
  for (int remaining = millis; remaining > 0; --remaining) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
    __nanosleep(1000000);
#else
    // Fallback for older CUDA devices:
    // burn a fixed amount of work whose duration is roughly stable between runs.
    // `volatile` keeps the compiler from discarding the otherwise unused result.
    volatile double acc = 0;
    int step            = 0;
    while (step < 10000) {
      acc = acc + double(step);
      acc /= 2.0;
      __syncthreads();
      ++step;
    }
#endif
  }
}
/**
 * Reproducer: cancel in-flight `interruptible::synchronize` calls issued from
 * OpenMP worker threads.
 *
 * Every OpenMP thread launches a `gpu_wait` kernel whose length grows with the
 * thread index and then waits on it through `interruptible::synchronize`.
 * The thread with index `n_expected_succeed - 1` cancels every token once its
 * own wait is over. The test expects the first `n_expected_succeed` threads to
 * finish normally and the remaining ones to observe `interrupted_exception`.
 */
TEST(Raft, InterruptibleOpenMP)
{
  // number of threads must be smaller than max number of resident grids for GPU
  const int n_threads = 10;
  // 1 <= n_expected_succeed <= n_threads
  const int n_expected_succeed = 5;
  // Difference, in gpu_wait iterations (~milliseconds), between the kernels of
  // thread i and thread i+1: thread i launches gpu_wait((1 + i) * thread_delay_millis).
  const int thread_delay_millis = 20;

  // One slot per OpenMP thread; each thread publishes its own token at its index.
  std::vector<std::shared_ptr<interruptible>> thread_tokens(n_threads);
  int n_finished  = 0;
  int n_cancelled = 0;

  omp_set_dynamic(0);
  omp_set_num_threads(n_threads);
#pragma omp parallel reduction(+ : n_finished) reduction(+ : n_cancelled) num_threads(n_threads)
  {
    const auto i = omp_get_thread_num();
    rmm::cuda_stream stream;
    // Short warm-up launch and wait before the token is published.
    gpu_wait<<<1, 1, 0, stream.value()>>>(1);
    interruptible::synchronize(stream);
    thread_tokens[i] = interruptible::get_token();

    // All tokens must be stored before any thread may iterate over them.
#pragma omp barrier
    try {
      gpu_wait<<<1, 1, 0, stream.value()>>>((1 + i) * thread_delay_millis);
      interruptible::synchronize(stream);
      n_finished = 1;
    } catch (const interrupted_exception&) {
      n_cancelled = 1;
    }
    if (i == n_expected_succeed - 1) {
      // Iterate by reference: copying a shared_ptr costs an atomic refcount
      // increment/decrement per element and is not needed just to call cancel().
      for (const auto& token : thread_tokens) {
        token->cancel();
      }
    }

#pragma omp barrier
    // clear the cancellation state to not disrupt other tests
    interruptible::yield_no_throw();
  }
  ASSERT_EQ(n_finished, n_expected_succeed);
  ASSERT_EQ(n_cancelled, n_threads - n_expected_succeed);
}
} // namespace raft
The workload (https://github.com/rapidsai/cuml/blob/branch-23.04/notebooks/random_forest_demo.ipynb) where this bug originated showed the following backtrace:
Backtrace:
/lib/libcuml++.so(_ZNSt10_HashtableINSt6thread2idESt4pairIKS1_St8weak_ptrIN4raft13interruptibleEEESaIS8_ENSt8__detail10_Select1stESt8equal_toIS1_ESt4hashIS1_ENSA_18_Mod_range_hashingENSA_20_Default_ranged_hashENSA_20_Prime_rehash_policyENSA_17_Hashtable_traitsILb0ELb0ELb1EEEE4findERS3_+0x40)[0x7f89afa46060]
/lib/libcuml++.so(_ZNSt19_Sp_counted_deleterIPN4raft13interruptibleEZNS1_14get_token_implILb1EEESt10shared_ptrIS1_ENSt6thread2idEEUlT_E_SaIvELN9__gnu_cxx12_Lock_policyE2EE10_M_disposeEv+0x45)[0x7f89afa5f895]
/lib/libcuml++.so(_ZNSt10shared_ptrIN4raft13interruptibleEED1Ev+0x50)[0x7f89afa0fd90]
/usr/lib/x86_64-linux-gnu/libc.so.6(__call_tls_dtors+0x3f)[0x7f8b473ea2bf]
/usr/lib/x86_64-linux-gnu/libpthread.so.0(+0x8617)[0x7f8b47388617]
/usr/lib/x86_64-linux-gnu/libc.so.6(clone+0x43)[0x7f8b474c2133]
The backtrace is very similar to the one reported here: #1225, so it's possible this is the same issue, but we need to verify. Note that LLVM OpenMP "should" be a drop-in replacement for libgomp from what I learned so far. So it's possible that this is a real use case where linking with llvm openmp is causing this bug, but I think it's very unlikely.
Activity