@@ -261,6 +261,130 @@ void run_dots_packed(std::string name, //
261261 bench_config.matrix_height , bench_config.matrix_width , bench_config.matrix_depth );
262262}
263263
264+ template <nk_dtype_t input_dtype_>
265+ void measure_angulars_packed ( //
266+ bm::State &state, //
267+ typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn, //
268+ typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn, //
269+ typename nk::type_for<input_dtype_>::type::angulars_packed_kernel_t kernel, //
270+ std::size_t m, std::size_t n, std::size_t k) {
271+
272+ using input_t = typename nk::type_for<input_dtype_>::type;
273+ using output_t = typename input_t ::angular_result_t ;
274+ using raw_input_t = typename input_t ::raw_t ;
275+ using raw_output_t = typename output_t ::raw_t ;
276+
277+ nk_size_t values_per_row = nk::divide_round_up (k, nk::dimensions_per_value<input_t >());
278+ nk_size_t a_stride_bytes = values_per_row * sizeof (typename input_t ::raw_t );
279+ nk_size_t b_stride_bytes = values_per_row * sizeof (typename input_t ::raw_t );
280+ nk_size_t packed_bytes = packed_size_fn (n, k);
281+
282+ std::size_t bytes_per_set = m * a_stride_bytes + n * b_stride_bytes + packed_bytes + m * n * sizeof (raw_output_t );
283+ std::size_t const sets_count = bench_input_count (bytes_per_set);
284+
285+ struct gemm_set_t {
286+ nk::vector<input_t > a, b;
287+ std::vector<char > b_packed;
288+ nk::vector<output_t > c;
289+ };
290+ std::vector<gemm_set_t > sets (sets_count);
291+ auto generator = make_random_engine ();
292+ for (auto &s : sets) {
293+ s.a = make_vector_for_matrix<input_dtype_>(m, k);
294+ s.b = make_vector_for_matrix<input_dtype_>(n, k);
295+ s.b_packed .resize (packed_bytes, 0 );
296+ s.c = make_vector<output_t >(m * n);
297+ nk::fill_uniform (generator, s.a .values_data (), s.a .size_values ());
298+ nk::fill_uniform (generator, s.b .values_data (), s.b .size_values ());
299+ pack_fn (s.b .raw_values_data (), n, k, b_stride_bytes, s.b_packed .data ());
300+ }
301+
302+ std::size_t iterations = 0 ;
303+ for (auto _ : state) {
304+ auto &s = sets[iterations & (sets_count - 1 )];
305+ bm::DoNotOptimize (s.c .raw_values_data ());
306+ kernel (s.a .raw_values_data (), s.b_packed .data (), s.c .raw_values_data (), //
307+ m, n, k, a_stride_bytes, n * sizeof (raw_output_t ));
308+ ++iterations;
309+ }
310+
311+ state.counters [" scalar-ops" ] = bm::Counter (iterations * 2.0 * m * n * k, bm::Counter::kIsRate );
312+ }
313+
314+ template <nk_dtype_t input_dtype_>
315+ void run_angulars_packed (std::string name, //
316+ typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn,
317+ typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn,
318+ typename nk::type_for<input_dtype_>::type::angulars_packed_kernel_t kernel) {
319+ std::string bench_name = name + " <" + std::to_string (bench_config.matrix_height ) + " x" +
320+ std::to_string (bench_config.matrix_width ) + " x" +
321+ std::to_string (bench_config.matrix_depth ) + " >" ;
322+ bm::RegisterBenchmark (bench_name.c_str (), measure_angulars_packed<input_dtype_>, packed_size_fn, pack_fn, kernel,
323+ bench_config.matrix_height , bench_config.matrix_width , bench_config.matrix_depth );
324+ }
325+
326+ template <nk_dtype_t input_dtype_>
327+ void measure_euclideans_packed ( //
328+ bm::State &state, //
329+ typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn, //
330+ typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn, //
331+ typename nk::type_for<input_dtype_>::type::euclideans_packed_kernel_t kernel, //
332+ std::size_t m, std::size_t n, std::size_t k) {
333+
334+ using input_t = typename nk::type_for<input_dtype_>::type;
335+ using output_t = typename input_t ::euclidean_result_t ;
336+ using raw_input_t = typename input_t ::raw_t ;
337+ using raw_output_t = typename output_t ::raw_t ;
338+
339+ nk_size_t values_per_row = nk::divide_round_up (k, nk::dimensions_per_value<input_t >());
340+ nk_size_t a_stride_bytes = values_per_row * sizeof (typename input_t ::raw_t );
341+ nk_size_t b_stride_bytes = values_per_row * sizeof (typename input_t ::raw_t );
342+ nk_size_t packed_bytes = packed_size_fn (n, k);
343+
344+ std::size_t bytes_per_set = m * a_stride_bytes + n * b_stride_bytes + packed_bytes + m * n * sizeof (raw_output_t );
345+ std::size_t const sets_count = bench_input_count (bytes_per_set);
346+
347+ struct gemm_set_t {
348+ nk::vector<input_t > a, b;
349+ std::vector<char > b_packed;
350+ nk::vector<output_t > c;
351+ };
352+ std::vector<gemm_set_t > sets (sets_count);
353+ auto generator = make_random_engine ();
354+ for (auto &s : sets) {
355+ s.a = make_vector_for_matrix<input_dtype_>(m, k);
356+ s.b = make_vector_for_matrix<input_dtype_>(n, k);
357+ s.b_packed .resize (packed_bytes, 0 );
358+ s.c = make_vector<output_t >(m * n);
359+ nk::fill_uniform (generator, s.a .values_data (), s.a .size_values ());
360+ nk::fill_uniform (generator, s.b .values_data (), s.b .size_values ());
361+ pack_fn (s.b .raw_values_data (), n, k, b_stride_bytes, s.b_packed .data ());
362+ }
363+
364+ std::size_t iterations = 0 ;
365+ for (auto _ : state) {
366+ auto &s = sets[iterations & (sets_count - 1 )];
367+ bm::DoNotOptimize (s.c .raw_values_data ());
368+ kernel (s.a .raw_values_data (), s.b_packed .data (), s.c .raw_values_data (), //
369+ m, n, k, a_stride_bytes, n * sizeof (raw_output_t ));
370+ ++iterations;
371+ }
372+
373+ state.counters [" scalar-ops" ] = bm::Counter (iterations * 2.0 * m * n * k, bm::Counter::kIsRate );
374+ }
375+
376+ template <nk_dtype_t input_dtype_>
377+ void run_euclideans_packed (std::string name, //
378+ typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn,
379+ typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn,
380+ typename nk::type_for<input_dtype_>::type::euclideans_packed_kernel_t kernel) {
381+ std::string bench_name = name + " <" + std::to_string (bench_config.matrix_height ) + " x" +
382+ std::to_string (bench_config.matrix_width ) + " x" +
383+ std::to_string (bench_config.matrix_depth ) + " >" ;
384+ bm::RegisterBenchmark (bench_name.c_str (), measure_euclideans_packed<input_dtype_>, packed_size_fn, pack_fn, kernel,
385+ bench_config.matrix_height , bench_config.matrix_width , bench_config.matrix_depth );
386+ }
387+
264388template <nk_dtype_t input_dtype_>
265389void measure_dots_symmetric ( //
266390 bm::State &state, //
@@ -314,6 +438,108 @@ void run_dots_symmetric(std::string name, //
314438 kernel, bench_config.matrix_height , bench_config.matrix_depth );
315439}
316440
441+ template <nk_dtype_t input_dtype_>
442+ void measure_angulars_symmetric ( //
443+ bm::State &state, //
444+ typename nk::type_for<input_dtype_>::type::angulars_symmetric_kernel_t kernel, //
445+ std::size_t n, std::size_t k) {
446+
447+ using input_t = typename nk::type_for<input_dtype_>::type;
448+ using output_t = typename input_t ::angular_result_t ;
449+ using raw_input_t = typename input_t ::raw_t ;
450+ using raw_output_t = typename output_t ::raw_t ;
451+
452+ nk_size_t input_values_per_row = nk::divide_round_up (k, nk::dimensions_per_value<input_t >());
453+ nk_size_t input_stride_bytes = input_values_per_row * sizeof (typename input_t ::raw_t );
454+ nk_size_t output_stride_bytes = n * sizeof (raw_output_t );
455+
456+ std::size_t bytes_per_set = n * input_stride_bytes + n * n * sizeof (raw_output_t );
457+ std::size_t const sets_count = bench_input_count (bytes_per_set);
458+
459+ struct syrk_set_t {
460+ nk::vector<input_t > a;
461+ nk::vector<output_t > c;
462+ };
463+ std::vector<syrk_set_t > sets (sets_count);
464+ auto generator = make_random_engine ();
465+ for (auto &s : sets) {
466+ s.a = make_vector_for_matrix<input_dtype_>(n, k);
467+ s.c = make_vector<output_t >(n * n);
468+ nk::fill_uniform (generator, s.a .values_data (), s.a .size_values ());
469+ }
470+
471+ std::size_t iterations = 0 ;
472+ for (auto _ : state) {
473+ auto &s = sets[iterations & (sets_count - 1 )];
474+ bm::DoNotOptimize (s.c .raw_values_data ());
475+ kernel (s.a .raw_values_data (), n, k, input_stride_bytes, //
476+ s.c .raw_values_data (), output_stride_bytes, 0 , n);
477+ ++iterations;
478+ }
479+
480+ state.counters [" scalar-ops" ] = bm::Counter (iterations * n * (n + 1 ) * k, bm::Counter::kIsRate );
481+ }
482+
483+ template <nk_dtype_t input_dtype_>
484+ void run_angulars_symmetric (std::string name, //
485+ typename nk::type_for<input_dtype_>::type::angulars_symmetric_kernel_t kernel) {
486+ std::string bench_name = name + " <" + std::to_string (bench_config.matrix_height ) + " x" +
487+ std::to_string (bench_config.matrix_depth ) + " >" ;
488+ bm::RegisterBenchmark (bench_name.c_str (), measure_angulars_symmetric<input_dtype_>, //
489+ kernel, bench_config.matrix_height , bench_config.matrix_depth );
490+ }
491+
492+ template <nk_dtype_t input_dtype_>
493+ void measure_euclideans_symmetric ( //
494+ bm::State &state, //
495+ typename nk::type_for<input_dtype_>::type::euclideans_symmetric_kernel_t kernel, //
496+ std::size_t n, std::size_t k) {
497+
498+ using input_t = typename nk::type_for<input_dtype_>::type;
499+ using output_t = typename input_t ::euclidean_result_t ;
500+ using raw_input_t = typename input_t ::raw_t ;
501+ using raw_output_t = typename output_t ::raw_t ;
502+
503+ nk_size_t input_values_per_row = nk::divide_round_up (k, nk::dimensions_per_value<input_t >());
504+ nk_size_t input_stride_bytes = input_values_per_row * sizeof (typename input_t ::raw_t );
505+ nk_size_t output_stride_bytes = n * sizeof (raw_output_t );
506+
507+ std::size_t bytes_per_set = n * input_stride_bytes + n * n * sizeof (raw_output_t );
508+ std::size_t const sets_count = bench_input_count (bytes_per_set);
509+
510+ struct syrk_set_t {
511+ nk::vector<input_t > a;
512+ nk::vector<output_t > c;
513+ };
514+ std::vector<syrk_set_t > sets (sets_count);
515+ auto generator = make_random_engine ();
516+ for (auto &s : sets) {
517+ s.a = make_vector_for_matrix<input_dtype_>(n, k);
518+ s.c = make_vector<output_t >(n * n);
519+ nk::fill_uniform (generator, s.a .values_data (), s.a .size_values ());
520+ }
521+
522+ std::size_t iterations = 0 ;
523+ for (auto _ : state) {
524+ auto &s = sets[iterations & (sets_count - 1 )];
525+ bm::DoNotOptimize (s.c .raw_values_data ());
526+ kernel (s.a .raw_values_data (), n, k, input_stride_bytes, //
527+ s.c .raw_values_data (), output_stride_bytes, 0 , n);
528+ ++iterations;
529+ }
530+
531+ state.counters [" scalar-ops" ] = bm::Counter (iterations * n * (n + 1 ) * k, bm::Counter::kIsRate );
532+ }
533+
534+ template <nk_dtype_t input_dtype_>
535+ void run_euclideans_symmetric (std::string name, //
536+ typename nk::type_for<input_dtype_>::type::euclideans_symmetric_kernel_t kernel) {
537+ std::string bench_name = name + " <" + std::to_string (bench_config.matrix_height ) + " x" +
538+ std::to_string (bench_config.matrix_depth ) + " >" ;
539+ bm::RegisterBenchmark (bench_name.c_str (), measure_euclideans_symmetric<input_dtype_>, //
540+ kernel, bench_config.matrix_height , bench_config.matrix_depth );
541+ }
542+
317543/* *
318544 * @brief Measure packed Hamming distance computation.
319545 * Used by: all bench_cross_*.cpp files
0 commit comments