Skip to content

Commit

Permalink
Use GCC's may_alias attribute for unaligned memory access
Browse files Browse the repository at this point in the history
  • Loading branch information
ccawley2011 authored and Dead2 committed Dec 24, 2024
1 parent fc90e7b commit d7e121e
Show file tree
Hide file tree
Showing 25 changed files with 197 additions and 150 deletions.
13 changes: 4 additions & 9 deletions arch/arm/chunkset_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "zbuild.h"
#include "zmemory.h"
#include "arch/generic/chunk_permute_table.h"

typedef uint8x16_t chunk_t;
Expand All @@ -31,21 +32,15 @@ static const lut_rem_pair perm_idx_lut[13] = {
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
*chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from)));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
*chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from)));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
uint64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
*chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from)));
}

#define CHUNKSIZE chunksize_neon
Expand Down
2 changes: 1 addition & 1 deletion arch/arm/compare256_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

Expand Down
4 changes: 2 additions & 2 deletions arch/generic/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c

compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c

compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c

crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
Expand Down
12 changes: 6 additions & 6 deletions arch/generic/chunkset_c.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#include "zbuild.h"
#include "zmemory.h"

typedef uint64_t chunk_t;

Expand All @@ -12,21 +13,20 @@ typedef uint64_t chunk_t;
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint8_t *dest = (uint8_t *)chunk;
memcpy(dest, from, sizeof(uint32_t));
memcpy(dest+4, from, sizeof(uint32_t));
uint32_t tmp = zng_memread_4(from);
*chunk = tmp | ((chunk_t)tmp << 32);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
memcpy(chunk, from, sizeof(uint64_t));
*chunk = zng_memread_8(from);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
*chunk = zng_memread_8(s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
memcpy(out, chunk, sizeof(uint64_t));
zng_memwrite_8(out, *chunk);
}

#define CHUNKSIZE chunksize_c
Expand Down
10 changes: 5 additions & 5 deletions arch/generic/compare256_c.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

Expand Down Expand Up @@ -107,8 +107,8 @@ static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const
do {
uint32_t sv, mv, diff;

memcpy(&sv, src0, sizeof(sv));
memcpy(&mv, src1, sizeof(mv));
sv = zng_memread_4(src0);
mv = zng_memread_4(src1);

diff = sv ^ mv;
if (diff) {
Expand Down Expand Up @@ -151,8 +151,8 @@ static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const
do {
uint64_t sv, mv, diff;

memcpy(&sv, src0, sizeof(sv));
memcpy(&mv, src1, sizeof(mv));
sv = zng_memread_8(src0);
mv = zng_memread_8(src1);

diff = sv ^ mv;
if (diff) {
Expand Down
13 changes: 4 additions & 9 deletions arch/power/chunkset_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "zmemory.h"

typedef vector unsigned char chunk_t;

Expand All @@ -15,21 +16,15 @@ typedef vector unsigned char chunk_t;
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats(tmp);
*chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats(tmp);
*chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
uint64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats((unsigned long long)tmp);
*chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
Expand Down
2 changes: 1 addition & 1 deletion arch/power/compare256_power9.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#ifdef POWER9
#include <altivec.h>
#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "zendian.h"

Expand Down
2 changes: 1 addition & 1 deletion arch/riscv/compare256_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#ifdef RISCV_RVV

#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

Expand Down
13 changes: 4 additions & 9 deletions arch/x86/chunkset_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"

#ifdef X86_AVX2
#include "avx2_tables.h"
Expand All @@ -19,21 +20,15 @@ typedef __m128i halfchunk_t;
#define HAVE_HALF_CHUNK

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi16(tmp);
*chunk = _mm256_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi32(tmp);
*chunk = _mm256_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi64x(tmp);
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
}

static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
Expand Down
13 changes: 4 additions & 9 deletions arch/x86/chunkset_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zmemory.h"

#ifdef X86_AVX512

Expand Down Expand Up @@ -33,21 +34,15 @@ static inline mask_t gen_mask(unsigned len) {
}

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi16(tmp);
*chunk = _mm256_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi32(tmp);
*chunk = _mm256_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi64x(tmp);
*chunk = _mm256_set1_epi64x(zng_memread_8(from));
}

static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
Expand Down
13 changes: 4 additions & 9 deletions arch/x86/chunkset_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#include "zbuild.h"
#include "zmemory.h"

#ifdef X86_SSE2
#include <immintrin.h>
Expand All @@ -14,21 +15,15 @@ typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
*chunk = _mm_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
*chunk = _mm_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
*chunk = _mm_set1_epi64x(zng_memread_8(from));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
Expand Down
13 changes: 4 additions & 9 deletions arch/x86/chunkset_ssse3.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#include "zbuild.h"
#include "zmemory.h"

#if defined(X86_SSSE3)
#include <immintrin.h>
Expand Down Expand Up @@ -33,21 +34,15 @@ static const lut_rem_pair perm_idx_lut[13] = {


static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
*chunk = _mm_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
*chunk = _mm_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
*chunk = _mm_set1_epi64x(zng_memread_8(from));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/compare256_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

Expand Down
2 changes: 1 addition & 1 deletion arch/x86/compare256_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

#include "zbuild.h"
#include "zutil_p.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

Expand Down
25 changes: 11 additions & 14 deletions compare256_rle.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/

#include "zbuild.h"
#include "zmemory.h"
#include "fallback_builtins.h"
#include "zendian.h"

Expand Down Expand Up @@ -47,25 +48,21 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
uint16_t src0_cmp, src1_cmp;
uint16_t src0_cmp;

memcpy(&src0_cmp, src0, sizeof(src0_cmp));
src0_cmp = zng_memread_2(src0);

do {
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
if (src0_cmp != zng_memread_2(src1))
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
if (src0_cmp != zng_memread_2(src1))
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
if (src0_cmp != zng_memread_2(src1))
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
if (src0_cmp != zng_memread_2(src1))
return len + (*src0 == *src1);
src1 += 2, len += 2;
} while (len < 256);
Expand All @@ -79,13 +76,13 @@ static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const ui
uint32_t sv, len = 0;
uint16_t src0_cmp;

memcpy(&src0_cmp, src0, sizeof(src0_cmp));
src0_cmp = zng_memread_2(src0);
sv = ((uint32_t)src0_cmp << 16) | src0_cmp;

do {
uint32_t mv, diff;

memcpy(&mv, src1, sizeof(mv));
mv = zng_memread_4(src1);

diff = sv ^ mv;
if (diff) {
Expand All @@ -112,14 +109,14 @@ static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const ui
uint16_t src0_cmp;
uint64_t sv;

memcpy(&src0_cmp, src0, sizeof(src0_cmp));
src0_cmp = zng_memread_2(src0);
src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;

do {
uint64_t mv, diff;

memcpy(&mv, src1, sizeof(mv));
mv = zng_memread_8(src1);

diff = sv ^ mv;
if (diff) {
Expand Down
Loading

0 comments on commit d7e121e

Please sign in to comment.