[BUG] Segmentation fault in Interruptible OpenMP

When linking with libomp (or libiomp5), we are seeing a segfault coming from `shared_ptr` usage in raft::interruptible. The following is a reproducer for this bug:

- Using ngc pytorch 23.01 container:
- Install gtest and libomp: `apt update && apt install libgtest-dev libomp-dev`
- Compile the following code with: `nvcc -Xcompiler=-fopenmp --std=c++17 test.cu -o test -lgtest -lgtest_main -lomp5` and run `./test`
```CPP
#include <cstddef>
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <omp.h>
#include <raft/interruptible.hpp>
#include <rmm/cuda_stream.hpp>
#include <thread>
#include <vector>

namespace raft {

/**
 * Test helper kernel that keeps the device busy for `millis` loop iterations.
 *
 * On devices with compute capability 7.0 or newer each iteration issues a
 * single `__nanosleep(1000000)` request (one million nanoseconds). On older
 * devices each iteration instead runs a fixed amount of throw-away arithmetic;
 * that fallback is only meant to take a repeatable amount of time and is not
 * calibrated to a millisecond.
 *
 * The test in this file launches it as <<<1, 1>>>. The fallback path calls
 * `__syncthreads()` on every inner step, which every thread of the block
 * reaches because the loop bounds are uniform.
 *
 * @param millis number of wait iterations; values <= 0 return immediately.
 */
__global__ void gpu_wait(int millis)
{
  auto remaining = millis;
  while (remaining > 0) {
#if __CUDA_ARCH__ >= 700
    // requires SM70+: ask the hardware to suspend this thread for ~1e6 ns.
    __nanosleep(1000000);
#else
    // Pre-SM70 fallback: burn time with arithmetic on a volatile accumulator
    // so the compiler cannot optimise the loop away.
    volatile double acc = 0;
    for (int step = 0; step < 10000; ++step) {
      acc = acc + static_cast<double>(step);
      acc = acc / 2.0;
      __syncthreads();
    }
#endif
    --remaining;
  }
}

// Reproducer: runs `n_threads` OpenMP threads, each waiting on its own CUDA
// stream through interruptible::synchronize, and has one of them cancel every
// thread's token part-way through. The test then checks that exactly
// `n_expected_succeed` threads finished their wait and that the remaining
// threads observed an interrupted_exception.
TEST(Raft, InterruptibleOpenMP)
{
  // number of threads must be smaller than max number of resident grids for GPU
  const int n_threads = 10;
  // 1 <= n_expected_succeed <= n_threads
  const int n_expected_succeed = 5;
  // Spacing between the waits of consecutive threads: thread i launches
  // gpu_wait with (1 + i) * thread_delay_millis iterations, so thread i+1
  // waits thread_delay_millis iterations longer than thread i.
  const int thread_delay_millis = 20;

  // One slot per OpenMP thread; each thread stores the token returned by
  // interruptible::get_token() into its own slot (index = omp thread number).
  std::vector<std::shared_ptr<interruptible>> thread_tokens(n_threads);
  // Summed across threads by the OpenMP reduction below; each thread
  // contributes 0 or 1 to each counter.
  int n_finished  = 0;
  int n_cancelled = 0;

  // Disable dynamic adjustment of the team size and request n_threads threads;
  // the test indexes thread_tokens by thread number and asserts on counts
  // derived from n_threads.
  omp_set_dynamic(0);
  omp_set_num_threads(n_threads);
#pragma omp parallel reduction(+ : n_finished) reduction(+ : n_cancelled) num_threads(n_threads)
  {
    auto i = omp_get_thread_num();
    // Each thread owns a separate stream for its kernel launches.
    rmm::cuda_stream stream;
    // Short warm-up wait, synchronized through the interruptible API, before
    // the token is recorded.
    gpu_wait<<<1, 1, 0, stream.value()>>>(1);
    interruptible::synchronize(stream);
    // NOTE(review): presumably this is the token associated with the calling
    // thread - confirm against the raft::interruptible documentation.
    thread_tokens[i] = interruptible::get_token();

    // Every thread must have stored its token before any thread may start
    // cancelling (the cancelling thread iterates over all slots).
#pragma omp barrier
    try {
      gpu_wait<<<1, 1, 0, stream.value()>>>((1 + i) * thread_delay_millis);
      interruptible::synchronize(stream);
      // Reached only if synchronize returned without throwing.
      n_finished = 1;
    } catch (interrupted_exception&) {
      n_cancelled = 1;
    }
    // The thread with the n_expected_succeed-th shortest wait cancels every
    // token (its own included) once its own try block is done. Threads with
    // longer waits are expected to still be inside synchronize at that point.
    if (i == n_expected_succeed - 1) {
      // `auto token` copies each shared_ptr by value for the duration of the
      // iteration.
      for (auto token : thread_tokens)
        token->cancel();
    }

    // Wait until all threads have finished or been cancelled before resetting.
#pragma omp barrier
    // clear the cancellation state to not disrupt other tests
    interruptible::yield_no_throw();
  }
  ASSERT_EQ(n_finished, n_expected_succeed);
  ASSERT_EQ(n_cancelled, n_threads - n_expected_succeed);
}
}  // namespace raft
```

The workload (https://github.com/rapidsai/cuml/blob/branch-23.04/notebooks/random_forest_demo.ipynb) where this bug originated showed the following backtrace:
```
Backtrace:
/lib/libcuml++.so(_ZNSt10_HashtableINSt6thread2idESt4pairIKS1_St8weak_ptrIN4raft13interruptibleEEESaIS8_ENSt8__detail10_Select1stESt8equal_toIS1_ESt4hashIS1_ENSA_18_Mod_range_hashingENSA_20_Default_ranged_hashENSA_20_Prime_rehash_policyENSA_17_Hashtable_traitsILb0ELb0ELb1EEEE4findERS3_+0x40)[0x7f89afa46060]
/lib/libcuml++.so(_ZNSt19_Sp_counted_deleterIPN4raft13interruptibleEZNS1_14get_token_implILb1EEESt10shared_ptrIS1_ENSt6thread2idEEUlT_E_SaIvELN9__gnu_cxx12_Lock_policyE2EE10_M_disposeEv+0x45)[0x7f89afa5f895]
/lib/libcuml++.so(_ZNSt10shared_ptrIN4raft13interruptibleEED1Ev+0x50)[0x7f89afa0fd90]
/usr/lib/x86_64-linux-gnu/libc.so.6(__call_tls_dtors+0x3f)[0x7f8b473ea2bf]
/usr/lib/x86_64-linux-gnu/libpthread.so.0(+0x8617)[0x7f8b47388617]
/usr/lib/x86_64-linux-gnu/libc.so.6(clone+0x43)[0x7f8b474c2133]
```
The backtrace is very similar to the one reported here: https://github.com/rapidsai/raft/issues/1225, so it's possible this is the same issue, but we need to verify. Note that LLVM OpenMP "should" be a drop-in replacement for libgomp from what I learned so far. So it's possible that this is a real use case where linking with llvm openmp is causing this bug, but I think it's very unlikely. 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[BUG] Segmentation fault in Interruptible OpenMP #1275

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development