Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cutlass 3xTF32,DMMA based L2/cosine distance kernels for SM 8.0 or higher #939

Merged
merged 28 commits into from
Nov 16, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
20648c5
cutlass based euclidean expanded, cosine kernels
mdoijade Oct 20, 2022
a9dabc8
add prior ampere pairwisedistmat kernel to prevent redundant kernel c…
mdoijade Oct 21, 2022
1a45bfa
add noexcept to the functor methods
mdoijade Oct 21, 2022
c6f091b
merge branch 22.12 and resolve conflicts
mdoijade Oct 21, 2022
7786fcb
fix comments, remove redundant code and fix formatting issues
mdoijade Oct 27, 2022
181fc40
add cutlass cmake support for raft with custom namespace, fix formati…
mdoijade Oct 28, 2022
3d34545
fix formatting issues
mdoijade Oct 28, 2022
02c23ed
fix the cutlass_include_dir path in cmake
mdoijade Nov 3, 2022
7933436
fix bugs in get_cutlass cmake to use cutlass provided properties corr…
mdoijade Nov 4, 2022
d4bdec5
remove the cutlass namespace setting in test cmakefiles as it is not …
mdoijade Nov 4, 2022
d26bcef
temp remove dist dependency from cutlass to check if it works in ci/cd
mdoijade Nov 4, 2022
4df4185
merge branch-22.12 latest changes
mdoijade Nov 7, 2022
451c3c0
fix get_cutlass.cmake to work with pylibraft by using NvidiaCutlass i…
mdoijade Nov 10, 2022
7b512f9
fix get_cutlass install path, make changes as per review comments
mdoijade Nov 10, 2022
a05e1e2
merge branch-22.12
mdoijade Nov 10, 2022
d32b4c0
fix clang format issues
mdoijade Nov 10, 2022
f7c440a
temp fix to check if python build works
mdoijade Nov 11, 2022
b1a1fd7
add raft-exports instead of raft-distance-exports as other raft compo…
mdoijade Nov 15, 2022
4ef44e7
make cutlass to depend only on raft_distance and add raft_distance de…
mdoijade Nov 16, 2022
186fcc7
fix cmake formatting issues
mdoijade Nov 16, 2022
8aa8909
prevent cutlass based pairwise dist kernels to be disabled on cuda 12…
mdoijade Nov 16, 2022
abfd493
Moving cutlass dependency to distance and nn to keep them separate.
cjnolet Nov 16, 2022
f1b1239
Adding CUTLASS to build docs as dependency
cjnolet Nov 16, 2022
32e6052
Updating to export to both distance and nn
cjnolet Nov 16, 2022
f6de9ee
Adding cutlass as private dependency
cjnolet Nov 16, 2022
9bf0647
Making cutlass INTERFACE in raft::nn and raft::distance
cjnolet Nov 16, 2022
8f0119a
Using proper exports per Robert Maynard's suggestion.
cjnolet Nov 16, 2022
6ad4fd1
Adding cutlass as private dependency of lib targets
cjnolet Nov 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add prior ampere pairwisedistmat kernel to prevent redundant kernel c…
…ompilation, fix clang formating and correct line endings
  • Loading branch information
mdoijade committed Oct 21, 2022
commit a9dabc8ccf65c75d7596e2f4439ac124367b3a63
3 changes: 1 addition & 2 deletions cpp/include/raft/core/cudart_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ inline int getMultiProcessorCount()
}

/** helper method to get the compute capability version (major, minor) of the current device */
inline std::pair <int,int> getMajorMinorVersion()
inline std::pair<int, int> getMajorMinorVersion()
{
int devId;
RAFT_CUDA_TRY(cudaGetDevice(&devId));
Expand All @@ -366,7 +366,6 @@ inline std::pair <int,int> getMajorMinorVersion()
return std::make_pair(majorVer, minorVer);
}


/** helper method to convert an array on device to a string on host */
template <typename T>
std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4)
Expand Down
75 changes: 36 additions & 39 deletions cpp/include/raft/distance/detail/cosine.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,23 @@
#pragma once

#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
#include <raft/linalg/norm.cuh>

namespace raft {
namespace distance {
namespace detail {

template <typename DataT, typename AccT>
struct CosineOp {
__device__ __host__ CosineOp() { }
__device__ __host__ AccT operator() (DataT &aNorm, const DataT &bNorm, DataT &accVal) const {
return static_cast<AccT>(1.0) - (AccT) (accVal / (aNorm * bNorm));
}
__device__ __host__ AccT operator() (DataT aData) const {
return aData;
}
__device__ __host__ CosineOp() {}
__device__ __host__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const
{
return static_cast<AccT>(1.0) - (AccT)(accVal / (aNorm * bNorm));
}
__device__ __host__ AccT operator()(DataT aData) const { return aData; }
};


/**
* @brief the cosine distance matrix calculation implementer
* It computes the following equation:
Expand Down Expand Up @@ -84,16 +82,15 @@ void cosineImpl(const DataT* x,
FinalLambda fin_op,
cudaStream_t stream)
{
const auto deviceVersion = getMajorMinorVersion();
const auto deviceVersion = getMajorMinorVersion();
if (deviceVersion.first >= 8) {
using CosineOp_ = CosineOp<DataT, AccT>;
CosineOp_ cosine_dist_op;

cutlassDistanceKernel<DataT, AccT, OutT, IdxT, VecLen, FinalLambda, CosineOp_, isRowMajor>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream);
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream);

} else {

typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;

Expand All @@ -106,46 +103,46 @@ void cosineImpl(const DataT* x,

// epilogue operation lambda for final value calculation
auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
DataT * regxn,
DataT * regyn,
IdxT gridStrideX,
IdxT gridStrideY) {
#pragma unroll
DataT * regxn,
DataT * regyn,
IdxT gridStrideX,
IdxT gridStrideY) {
#pragma unroll
for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
#pragma unroll
#pragma unroll
for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j]) );
acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j]));
}
}
};

constexpr size_t shmemSize =
KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
if (isRowMajor) {
auto cosineRowMajor = pairwiseDistanceMatKernel<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
true>;
auto cosineRowMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
true>;
dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
cosineRowMajor<<<grid, blk, shmemSize, stream>>>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
} else {
auto cosineColMajor = pairwiseDistanceMatKernel<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
false>;
auto cosineColMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
false>;
dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
cosineColMajor<<<grid, blk, shmemSize, stream>>>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
Expand Down
8 changes: 3 additions & 5 deletions cpp/include/raft/distance/detail/distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -618,11 +618,9 @@ void distance(const InType* x,

template <typename AccType, typename OutType, typename Index>
struct default_fin_op {
__host__ __device__ default_fin_op() { };
// functor signature.
__host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const {
return d_val;
}
__host__ __device__ default_fin_op(){};
// functor signature.
__host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const { return d_val; }
};

template <raft::distance::DistanceType distanceType,
Expand Down
87 changes: 43 additions & 44 deletions cpp/include/raft/distance/detail/euclidean.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,26 @@
#pragma once

#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
#include <raft/linalg/norm.cuh>

namespace raft {
namespace distance {
namespace detail {

template <typename DataT, typename AccT>
struct L2ExpandedOp {
bool sqrt;

__device__ __host__ L2ExpandedOp() : sqrt(false) { }
__device__ __host__ L2ExpandedOp(bool isSqrt) : sqrt(isSqrt) { }
__device__ __host__ AccT operator() (DataT &aNorm, const DataT &bNorm, DataT &accVal) const {
AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
return sqrt ? raft::mySqrt(outVal) : outVal;
}
bool sqrt;

__device__ __host__ L2ExpandedOp() : sqrt(false) {}
__device__ __host__ L2ExpandedOp(bool isSqrt) : sqrt(isSqrt) {}
__device__ __host__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const
{
AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
return sqrt ? raft::mySqrt(outVal) : outVal;
}

__device__ __host__ AccT operator() (DataT aData) const {
return aData;
}
__device__ __host__ AccT operator()(DataT aData) const { return aData; }
};

/**
Expand Down Expand Up @@ -89,16 +88,15 @@ void euclideanExpImpl(const DataT* x,
FinalLambda fin_op,
cudaStream_t stream)
{
const auto deviceVersion = getMajorMinorVersion();
const auto deviceVersion = getMajorMinorVersion();
if (deviceVersion.first >= 8) {
using L2Op = L2ExpandedOp<DataT, AccT>;
L2Op L2_dist_op(sqrt);

cutlassDistanceKernel<DataT, AccT, OutT, IdxT, VecLen, FinalLambda, L2Op, isRowMajor>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream);
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream);

} else {

typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;

Expand All @@ -111,55 +109,56 @@ void euclideanExpImpl(const DataT* x,

// epilogue operation lambda for final value calculation
auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
DataT * regxn,
DataT * regyn,
IdxT gridStrideX,
IdxT gridStrideY) {
#pragma unroll
DataT * regxn,
DataT * regyn,
IdxT gridStrideX,
IdxT gridStrideY) {
#pragma unroll
for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
#pragma unroll
#pragma unroll
for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
}
}
if (sqrt) {
#pragma unroll
#pragma unroll
for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
#pragma unroll
#pragma unroll
for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
acc[i][j] = raft::mySqrt(acc[i][j]);
}
}
}
};

constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
constexpr size_t shmemSize =
KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
if (isRowMajor) {
auto euclideanExpRowMajor = pairwiseDistanceMatKernel<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
true>;
auto euclideanExpRowMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
true>;
dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);

euclideanExpRowMajor<<<grid, blk, shmemSize, stream>>>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
} else {
auto euclideanExpColMajor = pairwiseDistanceMatKernel<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
false>;
auto euclideanExpColMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
DataT,
AccT,
OutT,
IdxT,
KPolicy,
decltype(core_lambda),
decltype(epilog_lambda),
FinalLambda,
false>;
dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
euclideanExpColMajor<<<grid, blk, shmemSize, stream>>>(
x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
Expand Down
83 changes: 83 additions & 0 deletions cpp/include/raft/distance/detail/pairwise_distance_base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,89 @@ __global__ __launch_bounds__(Policy::Nthreads, 2)
obj.run();
}

/**
 * @brief the distance matrix calculation kernel for L2 and cosine
 * for GPU arch < SM 8.0, this version is to make sure we don't recompile
 * these kernels for ampere or higher as we use cutlass kernel for it.
 * @tparam useNorms whether norms are needed
 * @tparam DataT input data-type (for A and B matrices)
 * @tparam AccT accumulation data-type
 * @tparam OutT output data-type (for C and D matrices)
 * @tparam IdxT index data-type
 * @tparam Policy struct which tunes the Contraction kernel
 * @tparam CoreLambda lambda which implements accumulation operation
 * @tparam EpilogueLambda lambda which implements operation for calculating
            final value.
 * @tparam FinalLambda final lambda called on final distance value
 * @tparam isRowMajor true if input/output is row major(default),
            false for column major
 *
 * @param[in] x input matrix
 * @param[in] y input matrix
 * @param[in] _xn row norms of input matrix A.
 * @param[in] _yn row norms of input matrix B.
 * @param[in] m number of rows of A and C/D
 * @param[in] n number of columns of B and C/D
 * @param[in] k number of cols of A and rows of B
 * @param[in] lda leading dimension of A
 * @param[in] ldb leading dimension of B
 * @param[in] ldd leading dimension of C/D
 * @param[out] dOutput output matrix
 * @param core_op the core lambda
 * @param epilog_op the epilogue lambda
 * @param fin_op the final gemm epilogue lambda
 */

template <bool useNorms,
          typename DataT,
          typename AccT,
          typename OutT,
          typename IdxT,
          typename Policy,
          typename CoreLambda,
          typename EpilogueLambda,
          typename FinalLambda,
          bool isRowMajor = true,
          bool writeOut = true>
__global__ __launch_bounds__(Policy::Nthreads, 2)

  void pairwiseDistanceMatKernelPriorToAmpere(const DataT* x,
                                              const DataT* y,
                                              const DataT* _xn,
                                              const DataT* _yn,
                                              IdxT m,
                                              IdxT n,
                                              IdxT k,
                                              IdxT lda,
                                              IdxT ldb,
                                              IdxT ldd,
                                              OutT* dOutput,
                                              CoreLambda core_op,
                                              EpilogueLambda epilog_op,
                                              FinalLambda fin_op)
{
  // Compile a device body only for pre-Ampere architectures. On SM 8.0+ the
  // cutlass-based path is taken by the host-side dispatch, so this kernel
  // compiles to an empty stub there and avoids redundant kernel compilation.
#if __CUDA_ARCH__ < 800
  // dynamic shared memory; size is supplied as the third launch parameter
  extern __shared__ char smem[];
  // no per-row epilogue work is needed for the L2/cosine expanded metrics
  auto rowEpilog = [] __device__(IdxT starty) { return; };

  // delegate the tiled contraction to the shared PairwiseDistances engine,
  // passing through the core accumulation, epilogue, and final lambdas
  PairwiseDistances<useNorms,
                    DataT,
                    AccT,
                    OutT,
                    IdxT,
                    Policy,
                    CoreLambda,
                    EpilogueLambda,
                    FinalLambda,
                    decltype(rowEpilog),
                    isRowMajor,
                    writeOut>
    obj(
      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
  obj.run();
#endif
}

template <typename P, typename IdxT, typename T>
dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func)
{
Expand Down
Loading