Skip to content

Commit 179e00f

Browse files
committed
Fix: Differentiate angulars and euclideans signatures
1 parent b7e8beb commit 179e00f

File tree

14 files changed

+827
-542
lines changed

14 files changed

+827
-542
lines changed

bench/bench.hpp

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,130 @@ void run_dots_packed(std::string name, //
261261
bench_config.matrix_height, bench_config.matrix_width, bench_config.matrix_depth);
262262
}
263263

264+
template <nk_dtype_t input_dtype_>
265+
void measure_angulars_packed( //
266+
bm::State &state, //
267+
typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn, //
268+
typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn, //
269+
typename nk::type_for<input_dtype_>::type::angulars_packed_kernel_t kernel, //
270+
std::size_t m, std::size_t n, std::size_t k) {
271+
272+
using input_t = typename nk::type_for<input_dtype_>::type;
273+
using output_t = typename input_t::angular_result_t;
274+
using raw_input_t = typename input_t::raw_t;
275+
using raw_output_t = typename output_t::raw_t;
276+
277+
nk_size_t values_per_row = nk::divide_round_up(k, nk::dimensions_per_value<input_t>());
278+
nk_size_t a_stride_bytes = values_per_row * sizeof(typename input_t::raw_t);
279+
nk_size_t b_stride_bytes = values_per_row * sizeof(typename input_t::raw_t);
280+
nk_size_t packed_bytes = packed_size_fn(n, k);
281+
282+
std::size_t bytes_per_set = m * a_stride_bytes + n * b_stride_bytes + packed_bytes + m * n * sizeof(raw_output_t);
283+
std::size_t const sets_count = bench_input_count(bytes_per_set);
284+
285+
struct gemm_set_t {
286+
nk::vector<input_t> a, b;
287+
std::vector<char> b_packed;
288+
nk::vector<output_t> c;
289+
};
290+
std::vector<gemm_set_t> sets(sets_count);
291+
auto generator = make_random_engine();
292+
for (auto &s : sets) {
293+
s.a = make_vector_for_matrix<input_dtype_>(m, k);
294+
s.b = make_vector_for_matrix<input_dtype_>(n, k);
295+
s.b_packed.resize(packed_bytes, 0);
296+
s.c = make_vector<output_t>(m * n);
297+
nk::fill_uniform(generator, s.a.values_data(), s.a.size_values());
298+
nk::fill_uniform(generator, s.b.values_data(), s.b.size_values());
299+
pack_fn(s.b.raw_values_data(), n, k, b_stride_bytes, s.b_packed.data());
300+
}
301+
302+
std::size_t iterations = 0;
303+
for (auto _ : state) {
304+
auto &s = sets[iterations & (sets_count - 1)];
305+
bm::DoNotOptimize(s.c.raw_values_data());
306+
kernel(s.a.raw_values_data(), s.b_packed.data(), s.c.raw_values_data(), //
307+
m, n, k, a_stride_bytes, n * sizeof(raw_output_t));
308+
++iterations;
309+
}
310+
311+
state.counters["scalar-ops"] = bm::Counter(iterations * 2.0 * m * n * k, bm::Counter::kIsRate);
312+
}
313+
314+
template <nk_dtype_t input_dtype_>
315+
void run_angulars_packed(std::string name, //
316+
typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn,
317+
typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn,
318+
typename nk::type_for<input_dtype_>::type::angulars_packed_kernel_t kernel) {
319+
std::string bench_name = name + "<" + std::to_string(bench_config.matrix_height) + "x" +
320+
std::to_string(bench_config.matrix_width) + "x" +
321+
std::to_string(bench_config.matrix_depth) + ">";
322+
bm::RegisterBenchmark(bench_name.c_str(), measure_angulars_packed<input_dtype_>, packed_size_fn, pack_fn, kernel,
323+
bench_config.matrix_height, bench_config.matrix_width, bench_config.matrix_depth);
324+
}
325+
326+
template <nk_dtype_t input_dtype_>
327+
void measure_euclideans_packed( //
328+
bm::State &state, //
329+
typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn, //
330+
typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn, //
331+
typename nk::type_for<input_dtype_>::type::euclideans_packed_kernel_t kernel, //
332+
std::size_t m, std::size_t n, std::size_t k) {
333+
334+
using input_t = typename nk::type_for<input_dtype_>::type;
335+
using output_t = typename input_t::euclidean_result_t;
336+
using raw_input_t = typename input_t::raw_t;
337+
using raw_output_t = typename output_t::raw_t;
338+
339+
nk_size_t values_per_row = nk::divide_round_up(k, nk::dimensions_per_value<input_t>());
340+
nk_size_t a_stride_bytes = values_per_row * sizeof(typename input_t::raw_t);
341+
nk_size_t b_stride_bytes = values_per_row * sizeof(typename input_t::raw_t);
342+
nk_size_t packed_bytes = packed_size_fn(n, k);
343+
344+
std::size_t bytes_per_set = m * a_stride_bytes + n * b_stride_bytes + packed_bytes + m * n * sizeof(raw_output_t);
345+
std::size_t const sets_count = bench_input_count(bytes_per_set);
346+
347+
struct gemm_set_t {
348+
nk::vector<input_t> a, b;
349+
std::vector<char> b_packed;
350+
nk::vector<output_t> c;
351+
};
352+
std::vector<gemm_set_t> sets(sets_count);
353+
auto generator = make_random_engine();
354+
for (auto &s : sets) {
355+
s.a = make_vector_for_matrix<input_dtype_>(m, k);
356+
s.b = make_vector_for_matrix<input_dtype_>(n, k);
357+
s.b_packed.resize(packed_bytes, 0);
358+
s.c = make_vector<output_t>(m * n);
359+
nk::fill_uniform(generator, s.a.values_data(), s.a.size_values());
360+
nk::fill_uniform(generator, s.b.values_data(), s.b.size_values());
361+
pack_fn(s.b.raw_values_data(), n, k, b_stride_bytes, s.b_packed.data());
362+
}
363+
364+
std::size_t iterations = 0;
365+
for (auto _ : state) {
366+
auto &s = sets[iterations & (sets_count - 1)];
367+
bm::DoNotOptimize(s.c.raw_values_data());
368+
kernel(s.a.raw_values_data(), s.b_packed.data(), s.c.raw_values_data(), //
369+
m, n, k, a_stride_bytes, n * sizeof(raw_output_t));
370+
++iterations;
371+
}
372+
373+
state.counters["scalar-ops"] = bm::Counter(iterations * 2.0 * m * n * k, bm::Counter::kIsRate);
374+
}
375+
376+
template <nk_dtype_t input_dtype_>
377+
void run_euclideans_packed(std::string name, //
378+
typename nk::type_for<input_dtype_>::type::dots_packed_size_kernel_t packed_size_fn,
379+
typename nk::type_for<input_dtype_>::type::dots_pack_kernel_t pack_fn,
380+
typename nk::type_for<input_dtype_>::type::euclideans_packed_kernel_t kernel) {
381+
std::string bench_name = name + "<" + std::to_string(bench_config.matrix_height) + "x" +
382+
std::to_string(bench_config.matrix_width) + "x" +
383+
std::to_string(bench_config.matrix_depth) + ">";
384+
bm::RegisterBenchmark(bench_name.c_str(), measure_euclideans_packed<input_dtype_>, packed_size_fn, pack_fn, kernel,
385+
bench_config.matrix_height, bench_config.matrix_width, bench_config.matrix_depth);
386+
}
387+
264388
template <nk_dtype_t input_dtype_>
265389
void measure_dots_symmetric( //
266390
bm::State &state, //
@@ -314,6 +438,108 @@ void run_dots_symmetric(std::string name, //
314438
kernel, bench_config.matrix_height, bench_config.matrix_depth);
315439
}
316440

441+
template <nk_dtype_t input_dtype_>
442+
void measure_angulars_symmetric( //
443+
bm::State &state, //
444+
typename nk::type_for<input_dtype_>::type::angulars_symmetric_kernel_t kernel, //
445+
std::size_t n, std::size_t k) {
446+
447+
using input_t = typename nk::type_for<input_dtype_>::type;
448+
using output_t = typename input_t::angular_result_t;
449+
using raw_input_t = typename input_t::raw_t;
450+
using raw_output_t = typename output_t::raw_t;
451+
452+
nk_size_t input_values_per_row = nk::divide_round_up(k, nk::dimensions_per_value<input_t>());
453+
nk_size_t input_stride_bytes = input_values_per_row * sizeof(typename input_t::raw_t);
454+
nk_size_t output_stride_bytes = n * sizeof(raw_output_t);
455+
456+
std::size_t bytes_per_set = n * input_stride_bytes + n * n * sizeof(raw_output_t);
457+
std::size_t const sets_count = bench_input_count(bytes_per_set);
458+
459+
struct syrk_set_t {
460+
nk::vector<input_t> a;
461+
nk::vector<output_t> c;
462+
};
463+
std::vector<syrk_set_t> sets(sets_count);
464+
auto generator = make_random_engine();
465+
for (auto &s : sets) {
466+
s.a = make_vector_for_matrix<input_dtype_>(n, k);
467+
s.c = make_vector<output_t>(n * n);
468+
nk::fill_uniform(generator, s.a.values_data(), s.a.size_values());
469+
}
470+
471+
std::size_t iterations = 0;
472+
for (auto _ : state) {
473+
auto &s = sets[iterations & (sets_count - 1)];
474+
bm::DoNotOptimize(s.c.raw_values_data());
475+
kernel(s.a.raw_values_data(), n, k, input_stride_bytes, //
476+
s.c.raw_values_data(), output_stride_bytes, 0, n);
477+
++iterations;
478+
}
479+
480+
state.counters["scalar-ops"] = bm::Counter(iterations * n * (n + 1) * k, bm::Counter::kIsRate);
481+
}
482+
483+
template <nk_dtype_t input_dtype_>
484+
void run_angulars_symmetric(std::string name, //
485+
typename nk::type_for<input_dtype_>::type::angulars_symmetric_kernel_t kernel) {
486+
std::string bench_name = name + "<" + std::to_string(bench_config.matrix_height) + "x" +
487+
std::to_string(bench_config.matrix_depth) + ">";
488+
bm::RegisterBenchmark(bench_name.c_str(), measure_angulars_symmetric<input_dtype_>, //
489+
kernel, bench_config.matrix_height, bench_config.matrix_depth);
490+
}
491+
492+
template <nk_dtype_t input_dtype_>
493+
void measure_euclideans_symmetric( //
494+
bm::State &state, //
495+
typename nk::type_for<input_dtype_>::type::euclideans_symmetric_kernel_t kernel, //
496+
std::size_t n, std::size_t k) {
497+
498+
using input_t = typename nk::type_for<input_dtype_>::type;
499+
using output_t = typename input_t::euclidean_result_t;
500+
using raw_input_t = typename input_t::raw_t;
501+
using raw_output_t = typename output_t::raw_t;
502+
503+
nk_size_t input_values_per_row = nk::divide_round_up(k, nk::dimensions_per_value<input_t>());
504+
nk_size_t input_stride_bytes = input_values_per_row * sizeof(typename input_t::raw_t);
505+
nk_size_t output_stride_bytes = n * sizeof(raw_output_t);
506+
507+
std::size_t bytes_per_set = n * input_stride_bytes + n * n * sizeof(raw_output_t);
508+
std::size_t const sets_count = bench_input_count(bytes_per_set);
509+
510+
struct syrk_set_t {
511+
nk::vector<input_t> a;
512+
nk::vector<output_t> c;
513+
};
514+
std::vector<syrk_set_t> sets(sets_count);
515+
auto generator = make_random_engine();
516+
for (auto &s : sets) {
517+
s.a = make_vector_for_matrix<input_dtype_>(n, k);
518+
s.c = make_vector<output_t>(n * n);
519+
nk::fill_uniform(generator, s.a.values_data(), s.a.size_values());
520+
}
521+
522+
std::size_t iterations = 0;
523+
for (auto _ : state) {
524+
auto &s = sets[iterations & (sets_count - 1)];
525+
bm::DoNotOptimize(s.c.raw_values_data());
526+
kernel(s.a.raw_values_data(), n, k, input_stride_bytes, //
527+
s.c.raw_values_data(), output_stride_bytes, 0, n);
528+
++iterations;
529+
}
530+
531+
state.counters["scalar-ops"] = bm::Counter(iterations * n * (n + 1) * k, bm::Counter::kIsRate);
532+
}
533+
534+
template <nk_dtype_t input_dtype_>
535+
void run_euclideans_symmetric(std::string name, //
536+
typename nk::type_for<input_dtype_>::type::euclideans_symmetric_kernel_t kernel) {
537+
std::string bench_name = name + "<" + std::to_string(bench_config.matrix_height) + "x" +
538+
std::to_string(bench_config.matrix_depth) + ">";
539+
bm::RegisterBenchmark(bench_name.c_str(), measure_euclideans_symmetric<input_dtype_>, //
540+
kernel, bench_config.matrix_height, bench_config.matrix_depth);
541+
}
542+
317543
/**
318544
* @brief Measure packed Hamming distance computation.
319545
* Used by: all bench_cross_*.cpp files

0 commit comments

Comments
 (0)