Skip to content

Commit

Permalink
Add slide_hash to functable, and enable the sse2-optimized version.
Browse files Browse the repository at this point in the history
Add necessary code to cmake and configure.
Fix slide_hash_sse2 to compile with zlib-ng.
  • Loading branch information
Dead2 committed Sep 4, 2019
1 parent 11f2e8f commit 4cee5dc
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 31 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ if(WITH_OPTIM)
if(HAVE_SSE2_INTRIN)
add_definitions(-DX86_SSE2)
set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/fill_window_sse.c)
set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/slide_sse.c)
if(NOT ${ARCH} MATCHES "x86_64")
add_intrinsics_option("${SSE2FLAG}")
add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
Expand Down
38 changes: 16 additions & 22 deletions arch/x86/slide_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,45 +8,39 @@
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "deflate.h"
#include "../../zbuild.h"
#include "../../deflate.h"

#ifdef USE_SSE_SLIDE
#include <immintrin.h>

void slide_hash_sse(deflate_state *s)
{
ZLIB_INTERNAL void slide_hash_sse2(deflate_state *s) {
Pos *p;
unsigned n;
Posf *p;
uInt wsize = s->w_size;
z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
unsigned wsize = s->w_size;
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);

n = s->hash_size;
p = &s->head[n] - 8;
do {
__m128i value, result;

value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);

#ifndef FASTEST
n = wsize;
p = &s->prev[n] - 8;
do {
__m128i value, result;

value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);

p -= 8;
n -= 8;
p -= 8;
n -= 8;
} while (n > 0);
#endif
}

#endif

8 changes: 4 additions & 4 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -994,8 +994,8 @@ case "${ARCH}" in
if test ${HAVE_SSE2_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE2"
SFLAGS="${SFLAGS} -DX86_SSE2"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo"

if test $forcesse2 -eq 1; then
CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
Expand Down Expand Up @@ -1045,8 +1045,8 @@ case "${ARCH}" in
CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"

ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo"

if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
Expand Down
7 changes: 3 additions & 4 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ typedef block_state (*compress_func) (deflate_state *s, int flush);
/* Compression function. Returns the block state after the call. */

static int deflateStateCheck (PREFIX3(stream) *strm);
static void slide_hash (deflate_state *s);
static block_state deflate_stored (deflate_state *s, int flush);
ZLIB_INTERNAL block_state deflate_fast (deflate_state *s, int flush);
ZLIB_INTERNAL block_state deflate_quick (deflate_state *s, int flush);
Expand Down Expand Up @@ -196,7 +195,7 @@ static const config configuration_table[10] = {
* bit values at the expense of memory usage). We slide even when level == 0 to
* keep the hash table consistent if we switch back to level > 0 later.
*/
static void slide_hash(deflate_state *s) {
ZLIB_INTERNAL void slide_hash_c(deflate_state *s) {
unsigned n;
Pos *p;
unsigned int wsize = s->w_size;
Expand Down Expand Up @@ -639,7 +638,7 @@ int ZEXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int level, int strategy
if (s->level != level) {
if (s->level == 0 && s->matches != 0) {
if (s->matches == 1) {
slide_hash(s);
functable.slide_hash(s);
} else {
CLEAR_HASH(s);
}
Expand Down Expand Up @@ -1297,7 +1296,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
s->block_start -= (long) wsize;
if (s->insert > s->strstart)
s->insert = s->strstart;
slide_hash(s);
functable.slide_hash(s);
more += wsize;
}
if (s->strm->avail_in == 0)
Expand Down
1 change: 1 addition & 0 deletions deflate.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ static inline void put_short(deflate_state *s, uint16_t w) {


void ZLIB_INTERNAL fill_window_c(deflate_state *s);
void ZLIB_INTERNAL slide_hash_c(deflate_state *s);

/* in trees.c */
void ZLIB_INTERNAL zng_tr_init(deflate_state *s);
Expand Down
30 changes: 29 additions & 1 deletion functable.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,18 @@ extern void fill_window_sse(deflate_state *s);
extern void fill_window_arm(deflate_state *s);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32)
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* CRC32 */
ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef DYNAMIC_CRC_TABLE
Expand All @@ -46,14 +52,22 @@ extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif


/* stub definitions */
ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count);
ZLIB_INTERNAL void fill_window_stub(deflate_state *s);
ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);
ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len);
ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);

/* functable init */
ZLIB_INTERNAL __thread struct functable_s functable = {fill_window_stub,insert_string_stub,adler32_stub,crc32_stub};
ZLIB_INTERNAL __thread struct functable_s functable = {
fill_window_stub,
insert_string_stub,
adler32_stub,
crc32_stub,
slide_hash_stub
};


/* stub functions */
Expand Down Expand Up @@ -88,6 +102,20 @@ ZLIB_INTERNAL void fill_window_stub(deflate_state *s) {
functable.fill_window(s);
}

ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
// Initialize default
functable.slide_hash=&slide_hash_c;

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
if (x86_cpu_has_sse2)
# endif
functable.slide_hash=&slide_hash_sse2;
#endif

functable.slide_hash(s);
}

ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
// Initialize default
functable.adler32=&adler32_c;
Expand Down
1 change: 1 addition & 0 deletions functable.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct functable_s {
Pos (* insert_string) (deflate_state *const s, const Pos str, unsigned int count);
uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len);
uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len);
void (* slide_hash) (deflate_state *s);
};

ZLIB_INTERNAL extern __thread struct functable_s functable;
Expand Down

0 comments on commit 4cee5dc

Please sign in to comment.