Skip to content

Commit

Permalink
Move crc32 folding functions into functable.
Browse files Browse the repository at this point in the history
  • Loading branch information
nmoinvaz authored and Dead2 committed Aug 13, 2021
1 parent db59b63 commit d802e89
Show file tree
Hide file tree
Showing 18 changed files with 153 additions and 114 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,7 @@ if(WITH_OPTIM)
check_pclmulqdq_intrinsics()
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
add_definitions(-DX86_PCLMULQDQ_CRC)
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc_folding.c)
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c)
add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
Expand Down Expand Up @@ -849,6 +849,7 @@ set(ZLIB_PRIVATE_HDRS
crc32_p.h
crc32_tbl.h
crc32_comb_tbl.h
crc32_fold.h
deflate.h
deflate_p.h
functable.h
Expand All @@ -873,6 +874,7 @@ set(ZLIB_SRCS
compress.c
crc32.c
crc32_comb.c
crc32_fold.c
deflate.c
deflate_fast.c
deflate_huff.c
Expand Down
2 changes: 2 additions & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ OBJZ = \
compress.o \
crc32.o \
crc32_comb.o \
crc32_fold.o \
deflate.o \
deflate_fast.o \
deflate_huff.o \
Expand Down Expand Up @@ -113,6 +114,7 @@ PIC_OBJZ = \
compress.lo \
crc32.lo \
crc32_comb.lo \
crc32_fold.lo \
deflate.lo \
deflate_fast.lo \
deflate_huff.lo \
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/INDEX.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ Contents
|Name|Description|
|:-|:-|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|slide_hash_sse2.c|SSE2 optimized slide_hash|
10 changes: 5 additions & 5 deletions arch/x86/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ all: \
compare258_avx.o compare258_avx.lo \
compare258_sse.o compare258_sse.lo \
insert_string_sse.o insert_string_sse.lo \
crc_folding.o crc_folding.lo \
crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
slide_hash_avx.o slide_hash_avx.lo \
slide_hash_sse.o slide_hash_sse.lo

Expand Down Expand Up @@ -68,11 +68,11 @@ insert_string_sse.o:
insert_string_sse.lo:
$(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c

crc_folding.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
crc32_fold_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c

crc_folding.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
crc32_fold_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c

slide_hash_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c
Expand Down
71 changes: 35 additions & 36 deletions arch/x86/crc_folding.c → arch/x86/crc32_fold_pclmulqdq.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,22 @@
*/

#ifdef X86_PCLMULQDQ_CRC
#include "../../zutil.h"

#include <inttypes.h>
#include <immintrin.h>
#include <wmmintrin.h>

#include "crc_folding.h"
#include "../../crc32_fold.h"

Z_INTERNAL void crc_fold_init(unsigned int crc0[4 * 5]) {
Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
_mm_storeu_si128((__m128i *)crc0 + 1, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)crc0 + 2, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)crc0 + 3, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487));
_mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128());

return 0;
}

static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
Expand Down Expand Up @@ -224,16 +227,16 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
*xmm_crc3 = _mm_castps_si128(ps_res);
}

Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, const unsigned char *src, long len) {
Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
char ALIGNED_(16) partial_buf[16] = { 0 };

/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
__m128i xmm_crc_part;

if (len < 16) {
Expand All @@ -260,7 +263,7 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
xmm_crc_part = _mm_setzero_si128();
}

while ((len -= 64) >= 0) {
while (len >= 64) {
/* CRC_LOAD */
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
Expand All @@ -282,14 +285,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons

src += 64;
dst += 64;
len -= 64;
}

/*
* len = num bytes left - 64
*/
if (len + 16 >= 0) {
len += 16;

if (len >= 48) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
Expand All @@ -303,15 +305,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);

len -= 48;
if (len == 0)
goto done;

dst += 48;
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
} else if (len + 32 >= 0) {
len += 32;

} else if (len >= 32) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);

Expand All @@ -323,14 +323,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);

len -= 32;
if (len == 0)
goto done;

dst += 32;
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
} else if (len + 48 >= 0) {
len += 48;

} else if (len >= 16) {
xmm_t0 = _mm_load_si128((__m128i *)src);

fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
Expand All @@ -339,13 +338,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons

xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);

len -= 16;
if (len == 0)
goto done;

dst += 16;
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
} else {
len += 64;
if (len == 0)
goto done;
memcpy(&xmm_crc_part, src, len);
Expand All @@ -358,11 +357,11 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)crc0 + 0, xmm_crc0);
_mm_storeu_si128((__m128i *)crc0 + 1, xmm_crc1);
_mm_storeu_si128((__m128i *)crc0 + 2, xmm_crc2);
_mm_storeu_si128((__m128i *)crc0 + 3, xmm_crc3);
_mm_storeu_si128((__m128i *)crc0 + 4, xmm_crc_part);
_mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0);
_mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1);
_mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2);
_mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3);
_mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part);
}

static const unsigned ALIGNED_(16) crc_k[] = {
Expand All @@ -382,18 +381,17 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) {
Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);

uint32_t crc;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);

/*
* k1
Expand Down Expand Up @@ -447,8 +445,9 @@ uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) {
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

crc = (uint32_t)_mm_extract_epi32(xmm_crc3, 2);
return ~crc;
crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));

return crc->value;
}

#endif
19 changes: 0 additions & 19 deletions arch/x86/crc_folding.h

This file was deleted.

4 changes: 2 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -1353,8 +1353,8 @@ case "${ARCH}" in
if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC"
SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc_folding.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
fi
fi
;;
Expand Down
31 changes: 0 additions & 31 deletions crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,34 +168,3 @@ Z_INTERNAL uint32_t crc32_big(uint32_t crc, const unsigned char *buf, uint64_t l
return ZSWAP32(c);
}
#endif /* BYTE_ORDER == BIG_ENDIAN */

#ifdef X86_PCLMULQDQ_CRC
#include "arch/x86/x86.h"
#include "arch/x86/crc_folding.h"

Z_INTERNAL void crc_finalize(deflate_state *const s) {
if (x86_cpu_has_pclmulqdq)
s->strm->adler = crc_fold_512to32(s->crc0);
}
#endif

Z_INTERNAL void crc_reset(deflate_state *const s) {
#ifdef X86_PCLMULQDQ_CRC
x86_check_features();
if (x86_cpu_has_pclmulqdq) {
crc_fold_init(s->crc0);
}
#endif
s->strm->adler = CRC32_INITIAL_VALUE;
}

Z_INTERNAL void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size) {
#ifdef X86_PCLMULQDQ_CRC
if (x86_cpu_has_pclmulqdq) {
crc_fold_copy(strm->state->crc0, dst, strm->next_in, size);
return;
}
#endif
memcpy(dst, strm->next_in, size);
strm->adler = PREFIX(crc32)(strm->adler, dst, size);
}
23 changes: 23 additions & 0 deletions crc32_fold.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/* crc32_fold.c -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"

#include "crc32_fold.h"

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
crc->value = CRC32_INITIAL_VALUE;
return crc->value;
}

Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
crc->value = functable.crc32(crc->value, src, len);
memcpy(dst, src, len);
}

Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
return crc->value;
}
17 changes: 17 additions & 0 deletions crc32_fold.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/* crc32_fold.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_

typedef struct crc32_fold_s {
uint32_t ALIGNED_(16) fold[4 * 5];
uint32_t value;
} crc32_fold;

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);

#endif
Loading

0 comments on commit d802e89

Please sign in to comment.