|
| 1 | +From d2f06cd65d7ac39c6dd6761eef162abc946b155b Mon Sep 17 00:00:00 2001 |
| 2 | +From: Adenilson Cavalcanti < [email protected]> |
| 3 | +Date: Tue, 11 Apr 2017 17:13:02 -0700 |
| 4 | +Subject: [PATCH] NEON implementation for Adler32 |
| 5 | + |
| 6 | +The checksum is calculated in the uncompressed PNG data |
| 7 | +and can be made much faster by using SIMD. |
| 8 | + |
| 9 | +Tests in ARMv8 yielded an improvement of about 3x |
| 10 | +(e.g. walltime was 350ms x 125ms for a 4096x4096 bytes |
| 11 | +executed 30 times). That results in at least 18% improvement |
| 12 | +in image decoding in Chromium. |
| 13 | + |
| 14 | +Further details at: |
| 15 | +https://bugs.chromium.org/p/chromium/issues/detail?id=688601 |
| 16 | +--- |
| 17 | + CMakeLists.txt | 29 +++++++--- |
| 18 | + adler32.c | 5 ++ |
| 19 | + contrib/README.contrib | 3 + |
| 20 | + contrib/arm/neon_adler32.c | 137 +++++++++++++++++++++++++++++++++++++++++++++ |
| 21 | + 4 files changed, 166 insertions(+), 8 deletions(-) |
| 22 | + create mode 100644 contrib/arm/neon_adler32.c |
| 23 | + |
| 24 | +diff --git a/CMakeLists.txt b/CMakeLists.txt |
| 25 | +index 0fe939df..8e75f664 100644 |
| 26 | +--- a/CMakeLists.txt |
| 27 | ++++ b/CMakeLists.txt |
| 28 | +@@ -7,6 +7,7 @@ set(VERSION "1.2.11") |
| 29 | + |
| 30 | + option(ASM686 "Enable building i686 assembly implementation") |
| 31 | + option(AMD64 "Enable building amd64 assembly implementation") |
| 32 | ++option(ARMv8 "Enable building ARM NEON intrinsics implementation") |
| 33 | + |
| 34 | + set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") |
| 35 | + set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") |
| 36 | +@@ -132,14 +133,26 @@ endif() |
| 37 | + if(CMAKE_COMPILER_IS_GNUCC) |
| 38 | + if(ASM686) |
| 39 | + set(ZLIB_ASMS contrib/asm686/match.S) |
| 40 | +- elseif (AMD64) |
| 41 | ++ elseif(AMD64) |
| 42 | + set(ZLIB_ASMS contrib/amd64/amd64-match.S) |
| 43 | +- endif () |
| 44 | ++ elseif(ARMv8) |
| 45 | ++ set(ZLIB_ARMv8 contrib/arm/neon_adler32.c) |
| 46 | ++ endif() |
| 47 | + |
| 48 | +- if(ZLIB_ASMS) |
| 49 | +- add_definitions(-DASMV) |
| 50 | +- set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) |
| 51 | +- endif() |
| 52 | ++ if(ZLIB_ASMS) |
| 53 | ++ add_definitions(-DASMV) |
| 54 | ++ set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) |
| 55 | ++ elseif(ZLIB_ARMv8) |
| 56 | ++ add_definitions(-DARMv8) |
| 57 | ++ set(COMPILER ${CMAKE_C_COMPILER}) |
| 58 | ++ # NEON is mandatory in ARMv8. |
| 59 | ++ if(${COMPILER} MATCHES "aarch64") |
| 60 | ++ set_source_files_properties(${ZLIB_ARMv8} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a) |
| 61 | ++ # But it was optional for ARMv7. |
| 62 | ++ elseif(${COMPILER} MATCHES "arm") |
| 63 | ++ set_source_files_properties(${ZLIB_ARMv8} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon) |
| 64 | ++ endif() |
| 65 | ++ endif() |
| 66 | + endif() |
| 67 | + |
| 68 | + if(MSVC) |
| 69 | +@@ -183,8 +196,8 @@ if(MINGW) |
| 70 | + set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) |
| 71 | + endif(MINGW) |
| 72 | + |
| 73 | +-add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
| 74 | +-add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
| 75 | ++add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARMv8} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
| 76 | ++add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARMv8} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
| 77 | + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) |
| 78 | + set_target_properties(zlib PROPERTIES SOVERSION 1) |
| 79 | + |
| 80 | +diff --git a/adler32.c b/adler32.c |
| 81 | +index d0be4380..45ebaa4b 100644 |
| 82 | +--- a/adler32.c |
| 83 | ++++ b/adler32.c |
| 84 | +@@ -136,7 +136,12 @@ uLong ZEXPORT adler32(adler, buf, len) |
| 85 | + const Bytef *buf; |
| 86 | + uInt len; |
| 87 | + { |
| 88 | ++#ifdef ARMv8 |
| 89 | ++# pragma message("Using NEON-ized Adler32.") |
| 90 | ++ return NEON_adler32(adler, buf, len); |
| 91 | ++#else |
| 92 | + return adler32_z(adler, buf, len); |
| 93 | ++#endif |
| 94 | + } |
| 95 | + |
| 96 | + /* ========================================================================= */ |
| 97 | +diff --git a/contrib/README.contrib b/contrib/README.contrib |
| 98 | +index a411d5c3..3fd1d202 100644 |
| 99 | +--- a/contrib/README.contrib |
| 100 | ++++ b/contrib/README.contrib |
| 101 | +@@ -12,6 +12,9 @@ amd64/ by Mikhail Teterin < [email protected]> |
| 102 | + asm code for AMD64 |
| 103 | + See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393 |
| 104 | + |
| 105 | ++arm/ by Adenilson Cavalcanti <[email protected]> |
| 106 | ++ ARM optimizations (NEON and ARMv8 code). |
| 107 | ++ |
| 108 | + asm686/ by Brian Raiter < [email protected]> |
| 109 | + asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax |
| 110 | + See http://www.muppetlabs.com/~breadbox/software/assembly.html |
| 111 | +diff --git a/contrib/arm/neon_adler32.c b/contrib/arm/neon_adler32.c |
| 112 | +new file mode 100644 |
| 113 | +index 00000000..f173a74f |
| 114 | +--- /dev/null |
| 115 | ++++ b/contrib/arm/neon_adler32.c |
| 116 | +@@ -0,0 +1,137 @@ |
| 117 | ++/* Copyright (C) 1995-2011, 2016 Mark Adler |
| 118 | ++ * Copyright (C) 2017 ARM Holdings Inc. |
| 119 | ++ * Authors: Adenilson Cavalcanti <[email protected]> |
| 120 | ++ * Simon Hosie <[email protected]> |
| 121 | ++ * This software is provided 'as-is', without any express or implied |
| 122 | ++ * warranty. In no event will the authors be held liable for any damages |
| 123 | ++ * arising from the use of this software. |
| 124 | ++ * Permission is granted to anyone to use this software for any purpose, |
| 125 | ++ * including commercial applications, and to alter it and redistribute it |
| 126 | ++ * freely, subject to the following restrictions: |
| 127 | ++ * 1. The origin of this software must not be misrepresented; you must not |
| 128 | ++ * claim that you wrote the original software. If you use this software |
| 129 | ++ * in a product, an acknowledgment in the product documentation would be |
| 130 | ++ * appreciated but is not required. |
| 131 | ++ * 2. Altered source versions must be plainly marked as such, and must not be |
| 132 | ++ * misrepresented as being the original software. |
| 133 | ++ * 3. This notice may not be removed or altered from any source distribution. |
| 134 | ++ */ |
| 135 | ++ |
| 136 | ++#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) |
| 137 | ++#include <arm_neon.h> |
| 138 | ++ |
| 139 | ++static void NEON_accum32(uint32_t *s, const unsigned char *buf, |
| 140 | ++ unsigned int len) |
| 141 | ++{ |
| 142 | ++ static const uint8_t taps[32] = { |
| 143 | ++ 32, 31, 30, 29, 28, 27, 26, 25, |
| 144 | ++ 24, 23, 22, 21, 20, 19, 18, 17, |
| 145 | ++ 16, 15, 14, 13, 12, 11, 10, 9, |
| 146 | ++ 8, 7, 6, 5, 4, 3, 2, 1 }; |
| 147 | ++ |
| 148 | ++ uint32x2_t adacc2, s2acc2, as; |
| 149 | ++ uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); |
| 150 | ++ |
| 151 | ++ uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); |
| 152 | ++ adacc = vsetq_lane_u32(s[0], adacc, 0); |
| 153 | ++ s2acc = vsetq_lane_u32(s[1], s2acc, 0); |
| 154 | ++ |
| 155 | ++ while (len >= 2) { |
| 156 | ++ uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); |
| 157 | ++ uint16x8_t adler, sum2; |
| 158 | ++ s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); |
| 159 | ++ adler = vpaddlq_u8( d0); |
| 160 | ++ adler = vpadalq_u8(adler, d1); |
| 161 | ++ sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); |
| 162 | ++ sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); |
| 163 | ++ sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); |
| 164 | ++ sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); |
| 165 | ++ adacc = vpadalq_u16(adacc, adler); |
| 166 | ++ s2acc = vpadalq_u16(s2acc, sum2); |
| 167 | ++ len -= 2; |
| 168 | ++ buf += 32; |
| 169 | ++ } |
| 170 | ++ |
| 171 | ++ while (len > 0) { |
| 172 | ++ uint8x16_t d0 = vld1q_u8(buf); |
| 173 | ++ uint16x8_t adler, sum2; |
| 174 | ++ s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); |
| 175 | ++ adler = vpaddlq_u8(d0); |
| 176 | ++ sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); |
| 177 | ++ sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); |
| 178 | ++ adacc = vpadalq_u16(adacc, adler); |
| 179 | ++ s2acc = vpadalq_u16(s2acc, sum2); |
| 180 | ++ buf += 16; |
| 181 | ++ len--; |
| 182 | ++ } |
| 183 | ++ |
| 184 | ++ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); |
| 185 | ++ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); |
| 186 | ++ as = vpadd_u32(adacc2, s2acc2); |
| 187 | ++ s[0] = vget_lane_u32(as, 0); |
| 188 | ++ s[1] = vget_lane_u32(as, 1); |
| 189 | ++} |
| 190 | ++ |
| 191 | ++static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, |
| 192 | ++ unsigned int len) |
| 193 | ++{ |
| 194 | ++ /* Oldie K&R code integration. */ |
| 195 | ++ unsigned int i; |
| 196 | ++ for (i = 0; i < len; ++i) { |
| 197 | ++ pair[0] += buf[i]; |
| 198 | ++ pair[1] += pair[0]; |
| 199 | ++ } |
| 200 | ++} |
| 201 | ++ |
| 202 | ++extern unsigned long NEON_adler32(unsigned long adler, const unsigned char *buf, |
| 203 | ++ const unsigned int len) |
| 204 | ++{ |
| 205 | ++ /* initial Adler-32 value (deferred check for len == 1 speed) */ |
| 206 | ++ if (!buf) |
| 207 | ++ return 1L; |
| 208 | ++ |
| 209 | ++ /* The largest prime smaller than 65536. */ |
| 210 | ++ const uint32_t M_BASE = 65521; |
| 211 | ++ /* This is the threshold where doing accumulation may overflow. */ |
| 212 | ++ const int M_NMAX = 5552; |
| 213 | ++ |
| 214 | ++ unsigned long sum2; |
| 215 | ++ uint32_t pair[2]; |
| 216 | ++ int n = M_NMAX; |
| 217 | ++ unsigned int done = 0; |
| 218 | ++ /* Oldie K&R code integration. */ |
| 219 | ++ unsigned int i; |
| 220 | ++ |
| 221 | ++ /* Split Adler-32 into component sums, it can be supplied by |
| 222 | ++ * the caller sites (e.g. in a PNG file). |
| 223 | ++ */ |
| 224 | ++ sum2 = (adler >> 16) & 0xffff; |
| 225 | ++ adler &= 0xffff; |
| 226 | ++ pair[0] = adler; |
| 227 | ++ pair[1] = sum2; |
| 228 | ++ |
| 229 | ++ for (i = 0; i < len; i += n) { |
| 230 | ++ if ((i + n) > len) |
| 231 | ++ n = len - i; |
| 232 | ++ |
| 233 | ++ if (n < 16) |
| 234 | ++ break; |
| 235 | ++ |
| 236 | ++ NEON_accum32(pair, buf + i, n / 16); |
| 237 | ++ pair[0] %= M_BASE; |
| 238 | ++ pair[1] %= M_BASE; |
| 239 | ++ |
| 240 | ++ done += (n / 16) * 16; |
| 241 | ++ } |
| 242 | ++ |
| 243 | ++ /* Handle the tail elements. */ |
| 244 | ++ if (done < len) { |
| 245 | ++ NEON_handle_tail(pair, (buf + done), len - done); |
| 246 | ++ pair[0] %= M_BASE; |
| 247 | ++ pair[1] %= M_BASE; |
| 248 | ++ } |
| 249 | ++ |
| 250 | ++ /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ |
| 251 | ++ return (pair[1] << 16) | pair[0]; |
| 252 | ++} |
| 253 | ++#endif |
0 commit comments