Skip to content

Commit

Permalink
Simplify avx2 chunkset a bit
Browse files Browse the repository at this point in the history
Put length 16 in the length-checking ladder and take care of it there,
since it's also a simple case to handle. We kind of went out of our way
to pretend 128-bit vectors didn't exist when using AVX2, but this can be
handled in a single instruction. Strangely, the intrinsic uses vector
register operands, but the instruction itself assumes a memory operand
for the source. This also means we don't have to handle this case in our
"GET_CHUNK_MAG" function.
  • Loading branch information
KungFuJesus authored and Dead2 committed Oct 12, 2024
1 parent dae668d commit b52e703
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 7 deletions.
10 changes: 6 additions & 4 deletions arch/x86/chunkset_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ typedef __m256i chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG

/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
Expand Down Expand Up @@ -68,6 +69,10 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi64x(tmp);
}

/* Fill *chunk with the 16 bytes found at `from`, repeated in both 128-bit
 * halves of the vector. The source is read with an unaligned load, so `from`
 * needs no particular alignment. */
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
    const __m128i lane = _mm_loadu_si128((const __m128i *)from);
    *chunk = _mm256_broadcastsi128_si256(lane);
}

/* Read one full vector's worth of bytes from `s` into *chunk using an
 * unaligned load; `s` is only read, never written. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m256i *src = (const __m256i *)s;
    *chunk = _mm256_loadu_si256(src);
}
Expand Down Expand Up @@ -99,10 +104,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
} else if (dist == 16) {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
} else {
} else {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
Expand Down
11 changes: 8 additions & 3 deletions chunkset_tpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,16 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
#ifdef HAVE_CHUNKMEMSET_8
if (dist == 8) {
chunkmemset_8(from, &chunk_load);
} else if (dist == sizeof(chunk_t)) {
loadchunk(from, &chunk_load);
} else
#endif
{
#ifdef HAVE_CHUNKMEMSET_16
if (dist == 16) {
chunkmemset_16(from, &chunk_load);
} else
#endif
if (dist == sizeof(chunk_t)) {
loadchunk(from, &chunk_load);
} else {
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}

Expand Down

0 comments on commit b52e703

Please sign in to comment.