Skip to content

Commit

Permalink
Standardize fill_window implementations and abstract out slide_hash_n…
Browse files Browse the repository at this point in the history
…eon for ARM.
  • Loading branch information
nmoinvaz authored and Dead2 committed Apr 30, 2020
1 parent 343596f commit e09d131
Show file tree
Hide file tree
Showing 23 changed files with 150 additions and 424 deletions.
16 changes: 10 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -624,16 +624,19 @@ endif()

if(WITH_OPTIM)
if(BASEARCH_ARM_FOUND)
add_definitions(-DARM_GETAUXVAL)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c ${ARCHDIR}/fill_window_arm.c)
add_definitions(-DARM_CPUID)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c)
if(WITH_NEON)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c)
add_definitions(-DARM_NEON_ADLER32)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_neon.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c)
add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH)
add_intrinsics_option("${NEONFLAG}")
if(MSVC)
add_definitions(-D__ARM_NEON__)
endif()
add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
add_feature_info(NEON_ALDER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
endif()
if(WITH_ACLE)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
Expand All @@ -659,6 +662,7 @@ if(WITH_OPTIM)
endif()
elseif(BASEARCH_X86_FOUND)
add_definitions(-DX86_CPUID)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86.c)
if(MSVC)
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
Expand All @@ -685,7 +689,7 @@ if(WITH_OPTIM)
endif()
if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
add_definitions(-DX86_SSE2)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/fill_window_sse.c ${ARCHDIR}/slide_sse.c)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c)
if(NOT ${ARCH} MATCHES "x86_64")
add_intrinsics_option("${SSE2FLAG}")
add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
Expand Down
10 changes: 5 additions & 5 deletions arch/arm/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo slide_neon.o slide_neon.lo insert_string_acle.o insert_string_acle.lo

adler32_neon.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
Expand All @@ -32,11 +32,11 @@ crc32_acle.o:
crc32_acle.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c

fill_window_arm.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
slide_neon.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c

fill_window_arm.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
slide_neon.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c

insert_string_acle.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
Expand Down
167 changes: 0 additions & 167 deletions arch/arm/fill_window_arm.c

This file was deleted.

48 changes: 48 additions & 0 deletions arch/arm/slide_neon.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017 Mika T. Lindqvist
*
* Authors:
* Mika T. Lindqvist <[email protected]>
* Jun He <[email protected]>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#if defined(ARM_NEON_SLIDEHASH)
#include <arm_neon.h>
#include "../../zbuild.h"
#include "../../deflate.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
register uint16x8_t v, *p;
register size_t n;

size_t size = entries*sizeof(table[0]);
Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");

Assert(sizeof(Pos) == 2, "Wrong Pos size");
v = vdupq_n_u16(window_size);

p = (uint16x8_t *)table;
n = size / (sizeof(uint16x8_t) * 8);
do {
p[0] = vqsubq_u16(p[0], v);
p[1] = vqsubq_u16(p[1], v);
p[2] = vqsubq_u16(p[2], v);
p[3] = vqsubq_u16(p[3], v);
p[4] = vqsubq_u16(p[4], v);
p[5] = vqsubq_u16(p[5], v);
p[6] = vqsubq_u16(p[6], v);
p[7] = vqsubq_u16(p[7], v);
p += 8;
} while (--n);
}

ZLIB_INTERNAL void slide_hash_neon(deflate_state *s) {
unsigned int wsize = s->w_size;

slide_hash_chain(s->head, s->hash_size, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
#endif
8 changes: 1 addition & 7 deletions arch/x86/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,14 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo

x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

fill_window_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

fill_window_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

deflate_quick.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c

Expand Down
2 changes: 1 addition & 1 deletion arch/x86/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ Contents

|Name|Description|
|:-|:-|
|fill_window_sse.c|SSE2 optimized fill_window|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|slide_sse2.c|SSE2 optimized slide_hash|
3 changes: 1 addition & 2 deletions arch/x86/deflate_quick.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
# include <ctype.h>
#endif

extern void fill_window_sse(deflate_state *s);
extern void flush_pending(PREFIX3(stream) *strm);

static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) {
Expand Down Expand Up @@ -209,7 +208,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
}

if (s->lookahead < MIN_LOOKAHEAD) {
fill_window_sse(s);
fill_window(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
static_emit_end_block(s, 0);
return need_more;
Expand Down
Loading

0 comments on commit e09d131

Please sign in to comment.