author    | CoprDistGit <infra@openeuler.org> | 2023-08-19 12:42:12 +0000
committer | CoprDistGit <infra@openeuler.org> | 2023-08-19 12:42:12 +0000
commit    | 4171d506f218bcfaa1c8d40b36596abfab644899 (patch)
tree      | 0e62eb269ac2671efc121d54075ddd016bfe4db3
parent    | ec1a16522fc9c54ca26742f73b94eab08a38f25d (diff)
automatic import of zlib
-rw-r--r-- | .gitignore                                               |   1
-rw-r--r-- | backport-fix-undefined-buffer-detected-by-oss-fuzz.patch |  30
-rw-r--r-- | backport-zlib-1.2.5-minizip-fixuncrypt.patch             |  14
-rw-r--r-- | sources                                                  |   1
-rw-r--r-- | zlib-1.3-SIMD.patch                                      | 770
-rw-r--r-- | zlib-Optimize-CRC32.patch                                |  94
-rw-r--r-- | zlib.spec                                                | 148
7 files changed, 1058 insertions(+), 0 deletions(-)
@@ -0,0 +1 @@
+/zlib-1.3.tar.xz
diff --git a/backport-fix-undefined-buffer-detected-by-oss-fuzz.patch b/backport-fix-undefined-buffer-detected-by-oss-fuzz.patch
new file mode 100644
index 0000000..5a94ce0
--- /dev/null
+++ b/backport-fix-undefined-buffer-detected-by-oss-fuzz.patch
@@ -0,0 +1,30 @@
+From fbc28a919107bb6fbdceb2d3dfe610ddcbc5ac89 Mon Sep 17 00:00:00 2001
+From: fangyufa <fangyufa1@huawei.com>
+Date: Tue, 3 Dec 2019 15:42:06 +0800
+Subject: [PATCH] zlib: fix undefined buffer detected by oss-fuzz
+
+this patch fixes a use of uninitialized value discovered by one of the
+fuzzers of the oss-fuzz project:
+https://github.com/google/oss-fuzz/blob/master/projects/zlib/example_dict_fuzzer.c
+clear out s->prev buffer to avoid undefined behavior
+
+signed-off-by: fangyufa <fangyufa1@huawei.com>
+---
+ zlib-1.2.11/deflate.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/deflate.c b/deflate.c
+index 4c42259..a03bef2 100644
+--- a/deflate.c
++++ b/deflate.c
+@@ -329,6 +329,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
+
+     s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
+     s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
++    memset(s->prev, 0, s->w_size*sizeof(Pos));
+     s->head   = (Posf *)  ZALLOC(strm, s->hash_size, sizeof(Pos));
+
+     s->high_water = 0;      /* nothing written to s->window yet */
+--
+2.19.1
+
diff --git a/backport-zlib-1.2.5-minizip-fixuncrypt.patch b/backport-zlib-1.2.5-minizip-fixuncrypt.patch
new file mode 100644
index 0000000..d113879
--- /dev/null
+++ b/backport-zlib-1.2.5-minizip-fixuncrypt.patch
@@ -0,0 +1,14 @@
+diff -up zlib-1.2.5/contrib/minizip/unzip.c.fixuncrypt zlib-1.2.5/contrib/minizip/unzip.c
+--- zlib-1.2.5/contrib/minizip/unzip.c.fixuncrypt	2011-11-11 12:13:56.335867758 -0500
++++ zlib-1.2.5/contrib/minizip/unzip.c	2011-11-11 12:14:01.747799372 -0500
+@@ -68,10 +68,6 @@
+ #include <stdlib.h>
+ #include <string.h>
+
+-#ifndef NOUNCRYPT
+-        #define NOUNCRYPT
+-#endif
+-
+ #include "zlib.h"
+ #include "unzip.h"
+
@@ -0,0 +1 @@
+b49e70aacafacfceb1107943497f5545 zlib-1.3.tar.xz
diff --git a/zlib-1.3-SIMD.patch b/zlib-1.3-SIMD.patch
new file mode 100644
index 0000000..3d4ec3e
--- /dev/null
+++ b/zlib-1.3-SIMD.patch
@@ -0,0 +1,770 @@
+From 91c1e78feec94739cc5da8562b3e2395bfdf6193 Mon Sep 17 00:00:00 2001
+From: hedongbo <hedongbo@huawei.com>
+Date: Sun, 14 Sep 2020 15:36:12 +0800
+Subject: [PATCH] zlib-1.2.11-SIMD.patch
+
+In the sampling of the Hive test program, it is found that inflate occupies a high proportion.
+The zlib is optimized through instruction set optimization, hash replacement, and compilation option optimization.
+The inflate and deflate processes of the Zlib library provided by the JDK are optimized to shorten the invoking time.
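The backport-fix-undefined-buffer patch above zeroes s->prev right after allocation because OSS-Fuzz's dictionary fuzzer reported a use of an uninitialized value in deflate. A minimal sketch of the code path that fuzzer exercises, written only against the public zlib API (the dictionary text, input string, and buffer size below are illustrative assumptions, not taken from the patch):

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        static const unsigned char dict[]  = "example dictionary text";
        static const unsigned char input[] = "example input that reuses the dictionary text";
        unsigned char out[256];
        z_stream strm;

        memset(&strm, 0, sizeof(strm));
        if (deflateInit2(&strm, Z_BEST_COMPRESSION, Z_DEFLATED, 15, 8,
                         Z_DEFAULT_STRATEGY) != Z_OK)
            return 1;

        /* Loading a preset dictionary feeds its bytes into the hash chains; this is
           the path on which OSS-Fuzz reported the uninitialized read that the
           backported memset of s->prev addresses. */
        if (deflateSetDictionary(&strm, dict, (uInt)(sizeof(dict) - 1)) != Z_OK)
            return 1;

        strm.next_in   = (Bytef *)input;
        strm.avail_in  = (uInt)(sizeof(input) - 1);
        strm.next_out  = out;
        strm.avail_out = (uInt)sizeof(out);
        if (deflate(&strm, Z_FINISH) != Z_STREAM_END)
            return 1;

        printf("compressed %u bytes to %lu bytes\n",
               (unsigned)(sizeof(input) - 1), strm.total_out);
        return deflateEnd(&strm) == Z_OK ? 0 : 1;
    }

Under a memory sanitizer, a build without the memset may flag the uninitialized read on this path; with the patch applied the behavior is defined either way.
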
+--- + CMakeLists.txt | 6 + + adler32.c | 169 +++++++++++++++++++++- + deflate.c | 22 ++- + inffast.c | 62 ++++++++- + inffast.h | 370 +++++++++++++++++++++++++++++++++++++++++++++++++ + inflate.c | 7 + + 6 files changed, 627 insertions(+), 9 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index b412dc7..40dc533 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -126,6 +126,12 @@ if(NOT MINGW) + ) + endif() + ++if(CMAKE_COMPILER_IS_GNUCC) ++ if(ARM_NEON) ++ add_definitions(-DHASH_ARMV8_CRC32 -march=armv8-a+crc -DUNALIGNED_OK -DADLER32_SIMD_NEON -DINFLATE_CHUNK_SIMD_NEON -O3) ++ endif() ++endif() ++ + # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) + string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" +diff --git a/adler32.c b/adler32.c +index d0be438..6ced75d 100644 +--- a/adler32.c ++++ b/adler32.c +@@ -59,7 +59,169 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); + # define MOD63(a) a %= BASE + #endif + +-/* ========================================================================= */ ++#if defined(ADLER32_SIMD_NEON) ++#include <arm_neon.h> ++/* ++ * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. ++ */ ++uint32x4_t ZLIB_INTERNAL mul_add_bytes( ++ uint32x4_t v_s2, ++ uint16x8_t v_column_sum_1, ++ uint16x8_t v_column_sum_2, ++ uint16x8_t v_column_sum_3, ++ uint16x8_t v_column_sum_4) ++{ ++ v_s2 = vshlq_n_u32(v_s2, 5); ++ ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1), ++ (uint16x4_t) { 32, 31, 30, 29 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), ++ (uint16x4_t) { 28, 27, 26, 25 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2), ++ (uint16x4_t) { 24, 23, 22, 21 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), ++ (uint16x4_t) { 20, 19, 18, 17 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3), ++ (uint16x4_t) { 16, 15, 14, 13 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), ++ (uint16x4_t) { 12, 11, 10, 9 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4), ++ (uint16x4_t) { 8, 7, 6, 5 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), ++ (uint16x4_t) { 4, 3, 2, 1 }); ++ return v_s2; ++} ++ ++/* ++ * Handle leftover data. ++ */ ++uLong ZLIB_INTERNAL leftover_handler(uint32_t s1, uint32_t s2, const Bytef *buf, z_size_t len) ++{ ++ if (len) { ++ if (len >= 16) { ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ len -= 16; ++ } ++ ++ while (len--) { ++ s2 += (s1 += *buf++); ++ } ++ ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 %= BASE; ++ } ++ ++ /* ++ * Return the recombined sums. ++ */ ++ return s1 | (s2 << 16); ++} ++ ++uLong ZLIB_INTERNAL adler32_simd_(uLong adler, const Bytef *buf, z_size_t len) ++{ ++ /* ++ * Split Adler-32 into component sums. ++ */ ++ uint32_t s1 = adler & 0xffff; ++ uint32_t s2 = adler >> 16; ++ /* ++ * Serially compute s1 & s2, until the data is 16-byte aligned. ++ */ ++ if ((uintptr_t)buf & 0xf) { ++ while ((uintptr_t)buf & 0xf) { ++ s2 += (s1 += *buf++); ++ --len; ++ } ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 %= BASE; ++ } ++ /* ++ * Process the data in blocks. 
++ */ ++ const unsigned BLOCK_SIZE = 1 << 5; ++ z_size_t blocks = len / BLOCK_SIZE; ++ len -= blocks * BLOCK_SIZE; ++ while (blocks) { ++ unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ ++ if (n > blocks) ++ n = (unsigned) blocks; ++ blocks -= n; ++ /* ++ * Process n blocks of data. At most NMAX data bytes can be ++ * processed before s2 must be reduced modulo BASE. ++ */ ++ uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n }; ++ uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; ++ ++ uint16x8_t v_column_sum_1 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_2 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_3 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_4 = vdupq_n_u16(0); ++ do { ++ /* ++ * Load 32 input bytes. ++ */ ++ const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf)); ++ const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16)); ++ /* ++ * Add previous block byte sum to v_s2. ++ */ ++ v_s2 = vaddq_u32(v_s2, v_s1); ++ /* ++ * Horizontally add the bytes for s1. ++ */ ++ v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); ++ /* ++ * Vertically add the bytes for s2. ++ */ ++ v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1)); ++ v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); ++ v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2)); ++ v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); ++ buf += BLOCK_SIZE; ++ } while (--n); ++ v_s2 = mul_add_bytes(v_s2, v_column_sum_1, v_column_sum_2, v_column_sum_3, v_column_sum_4); ++ /* ++ * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). ++ */ ++ uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); ++ uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); ++ uint32x2_t s1s2 = vpadd_u32(sum1, sum2); ++ ++ s1 += vget_lane_u32(s1s2, 0); ++ s2 += vget_lane_u32(s1s2, 1); ++ /* ++ * Reduce. ++ */ ++ s1 %= BASE; ++ s2 %= BASE; ++ } ++ return leftover_handler(s1, s2, buf, len); ++ ++} ++#endif ++ + uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) { + unsigned long sum2; + unsigned n; +@@ -68,6 +230,11 @@ uLong ZEXPORT adler32_z(adler, buf, len) + unsigned long sum2; + unsigned n; + ++#if defined(ADLER32_SIMD_NEON) ++ if (buf && len >= 64) ++ return adler32_simd_(adler, buf, len); ++#endif ++ + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; +diff --git a/deflate.c b/deflate.c +index f290783..31d1cfe 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -154,7 +154,16 @@ local const config configuration_table[10] = { + * characters, so that a running hash key can be computed from the previous + * key instead of complete recalculation each time. 
+ */ +-#define UPDATE_HASH(s,h,c) (h = (((h) << s->hash_shift) ^ (c)) & s->hash_mask) ++#if defined(HASH_ARMV8_CRC32) ++#include <arm_acle.h> ++#define UPDATE_HASH_CRC_INTERNAL(s, h, c) \ ++ (h = __crc32w(0, (c) & 0xFFFFFF) & ((deflate_state *)s)->hash_mask) ++ ++#define UPDATE_HASH(s, h, c) \ ++ UPDATE_HASH_CRC_INTERNAL(s, h, *(unsigned *)((uintptr_t)(&c) - (MIN_MATCH-1))) ++#else ++#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) ++#endif + + + /* =========================================================================== +@@ -1226,14 +1235,15 @@ local unsigned read_buf(strm, buf, size) + strm->avail_in -= len; + + zmemcpy(buf, strm->next_in, len); +- if (strm->state->wrap == 1) { +- strm->adler = adler32(strm->adler, buf, len); +- } + #ifdef GZIP +- else if (strm->state->wrap == 2) { ++ if (strm->state->wrap == 2) { /* use crc32 algo */ + strm->adler = crc32(strm->adler, buf, len); +- } ++ } else + #endif ++ if (strm->state->wrap == 1) { ++ strm->adler = adler32(strm->adler, buf, len); ++ } ++ + strm->next_in += len; + strm->total_in += len; + +diff --git a/inffast.c b/inffast.c +index 1fec7f3..84c5aba 100644 +--- a/inffast.c ++++ b/inffast.c +@@ -57,6 +57,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + unsigned char FAR *out; /* local strm->next_out */ + unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ + unsigned char FAR *end; /* while out < end, enough space available */ ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ unsigned char FAR *limit; /* safety limit for chunky copies */ ++#endif + #ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ + #endif +@@ -89,7 +92,12 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + #endif + wsize = state->wsize; + whave = state->whave; ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ limit = out + strm->avail_out; ++ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; ++#else + wnext = state->wnext; ++#endif + window = state->window; + hold = state->hold; + bits = state->bits; +@@ -197,6 +205,45 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + #endif + } + from = window; ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; ++ } ++ else { /* wrap around window */ ++ op -= wnext; ++ from += wsize - op; ++ if (op < len) { /* some from end of window */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, limit); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. ++ */ ++ } ++ } ++ if (op < len) { /* still need some from output */ ++ out = chunkcopy_safe(out, from, op, limit); ++ len -= op; ++ /* When dist is small the amount of data that can be ++ copied from the window is also small, and progress ++ towards the dangerous end of the output buffer is ++ also small. This means that for trivial memsets and ++ for chunkunroll_relaxed() a safety check is ++ unnecessary. However, these conditions may not be ++ entered at all, and in that case it's possible that ++ the main copy is near the end. 
++ */ ++ out = chunkunroll_relaxed(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, limit); ++ } ++ else { ++ /* from points to window, so there is no risk of ++ overlapping pointers requiring memset-like behaviour ++ */ ++ out = chunkcopy_safe(out, from, len, limit); ++ } ++#else + if (wnext == 0) { /* very common case */ + from += wsize - op; + if (op < len) { /* some from window */ +@@ -247,8 +294,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + if (len > 1) + *out++ = *from++; + } ++#endif + } +- else { ++ else { ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ /* Whole reference is in range of current output. No ++ range checks are necessary because we start with room ++ for at least 258 bytes of output, so unroll and roundoff ++ operations can write beyond `out+len` so long as they ++ stay within 258 bytes of `out`. ++ */ ++ out = chunkcopy_lapped_relaxed(out, dist, len); ++#else + from = out - dist; /* copy direct from output */ + do { /* minimum length is three */ + *out++ = *from++; +@@ -260,7 +317,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + *out++ = *from++; + if (len > 1) + *out++ = *from++; +- } ++ } ++#endif + } + } + else if ((op & 64) == 0) { /* 2nd level distance code */ +diff --git a/inffast.h b/inffast.h +index e5c1aa4..259882c 100644 +--- a/inffast.h ++++ b/inffast.h +@@ -8,4 +8,374 @@ + subject to change. Applications should only use zlib.h. + */ + ++/* ++ * The chunk-copy code below deals with writing the decoded DEFLATE data to ++ * the output with SIMD methods to increase decode speed. Reading the input ++ * to the DEFLATE decoder with a wide, SIMD method can also increase decode ++ * speed. This option is supported on little endian machines, and reads the ++ * input data in 64-bit (8 byte) chunks. ++ */ ++ + void ZLIB_INTERNAL inflate_fast(z_streamp strm, unsigned start); ++ ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ ++#include <stdint.h> ++#include "zutil.h" ++#include <arm_neon.h> ++ ++typedef uint8x16_t z_vec128i_t; ++ ++#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1] ++ ++#if __STDC_VERSION__ >= 199901L ++#define Z_RESTRICT restrict ++#else ++#define Z_RESTRICT ++#endif ++ ++#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__) ++#define Z_BUILTIN_MEMCPY __builtin_memcpy ++#else ++#define Z_BUILTIN_MEMCPY zmemcpy ++#endif ++ ++/* ++ * chunk copy type: the z_vec128i_t type size should be exactly 128-bits ++ * and equal to CHUNKCOPY_CHUNK_SIZE. ++ */ ++#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t) ++ ++Z_STATIC_ASSERT(vector_128_bits_wide, ++ CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16); ++ ++/* ++ * Ask the compiler to perform a wide, unaligned load with a machinevst1q_u8 ++ * instruction appropriate for the z_vec128i_t type. ++ */ ++static inline z_vec128i_t loadchunk( ++ const unsigned char FAR* s) ++{ ++ z_vec128i_t v; ++ Z_BUILTIN_MEMCPY(&v, s, sizeof(v)); ++ return v; ++} ++ ++/* ++ * Ask the compiler to perform a wide, unaligned store with a machine ++ * instruction appropriate for the z_vec128i_t type. ++ */ ++static inline void storechunk( ++ unsigned char FAR* d, ++ const z_vec128i_t v) ++{ ++ Z_BUILTIN_MEMCPY(d, &v, sizeof(v)); ++} ++ ++/* ++ * Perform a memcpy-like operation, assuming that length is non-zero and that ++ * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ * the length is shorter than this. 
++ * ++ * It also guarantees that it will properly unroll the data if the distance ++ * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on ++ * in chunkcopy_relaxed(). ++ * ++ * Aside from better memory bus utilisation, this means that short copies ++ * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop ++ * without iteration, which will hopefully make the branch prediction more ++ * reliable. ++ */ ++static inline unsigned char FAR* chunkcopy_core( ++ unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len) ++{ ++ const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; ++ storechunk(out, loadchunk(from)); ++ out += bump; ++ from += bump; ++ len /= CHUNKCOPY_CHUNK_SIZE; ++ while (len-- > 0) { ++ storechunk(out, loadchunk(from)); ++ out += CHUNKCOPY_CHUNK_SIZE; ++ from += CHUNKCOPY_CHUNK_SIZE; ++ } ++ return out; ++} ++ ++/* ++ * Like chunkcopy_core(), but avoid writing beyond of legal output. ++ * ++ * Accepts an additional pointer to the end of safe output. A generic safe ++ * copy would use (out + len), but it's normally the case that the end of the ++ * output buffer is beyond the end of the current copy, and this can still be ++ * exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_core_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len, ++ unsigned char FAR* limit) ++{ ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if ((limit - out) < (ptrdiff_t) CHUNKCOPY_CHUNK_SIZE) { ++ const unsigned char FAR* Z_RESTRICT rfrom = from; ++ if (len & 8) { ++ Z_BUILTIN_MEMCPY(out, rfrom, 8); ++ out += 8; ++ rfrom += 8; ++ } ++ if (len & 4) { ++ Z_BUILTIN_MEMCPY(out, rfrom, 4); ++ out += 4; ++ rfrom += 4; ++ } ++ if (len & 2) { ++ Z_BUILTIN_MEMCPY(out, rfrom, 2); ++ out += 2; ++ rfrom += 2; ++ } ++ if (len & 1) { ++ *out++ = *rfrom++; ++ } ++ return out; ++ } ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ * Perform short copies until distance can be rewritten as being at least ++ * CHUNKCOPY_CHUNK_SIZE. ++ * ++ * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE ++ * bytes of output even if the copy is shorter than this. This assumption ++ * holds within zlib inflate_fast(), which starts every iteration with at ++ * least 258 bytes of output space available (258 being the maximum length ++ * output from a single token; see inffast.c). ++ */ ++static inline unsigned char FAR* chunkunroll_relaxed( ++ unsigned char FAR* out, ++ unsigned FAR* dist, ++ unsigned FAR* len) ++{ ++ const unsigned char FAR* from = out - *dist; ++ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { ++ storechunk(out, loadchunk(from)); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++/* ++ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in ++ * every 64-bit component of the 128-bit result (64-bit int splat). ++ */ ++static inline z_vec128i_t v_load64_dup(const void* src) ++{ ++ return vcombine_u8(vld1_u8(src), vld1_u8(src)); ++} ++ ++/* ++ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in ++ * every 32-bit component of the 128-bit result (32-bit int splat). ++ */ ++static inline z_vec128i_t v_load32_dup(const void* src) ++{ ++ int32_t i32; ++ Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32)); ++ return vreinterpretq_u8_s32(vdupq_n_s32(i32)); ++} ++ ++/* ++ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in ++ * every 16-bit component of the 128-bit result (16-bit int splat). 
++ */ ++static inline z_vec128i_t v_load16_dup(const void* src) ++{ ++ int16_t i16; ++ Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16)); ++ return vreinterpretq_u8_s16(vdupq_n_s16(i16)); ++} ++ ++/* ++ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit ++ * component of the 128-bit result (8-bit int splat). ++ */ ++static inline z_vec128i_t v_load8_dup(const void* src) ++{ ++ return vld1q_dup_u8((const uint8_t*) src); ++} ++ ++/* ++ * v_store_128(): store the 128-bit vec in a memory destination (that might ++ * not be 16-byte aligned) void* out. ++ */ ++static inline void v_store_128(unsigned char* out, const z_vec128i_t vec) ++{ ++ vst1q_u8(out, vec); ++} ++ ++/* ++ * Perform an overlapping copy which behaves as a memset() operation, but ++ * supporting periods other than one, and assume that length is non-zero and ++ * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output ++ * even if the length is shorter than this. ++ */ ++static inline unsigned char FAR* chunkset_store_result( ++ unsigned len, ++ unsigned char FAR* out, ++ z_vec128i_t v) ++{ ++ do { ++ v_store_128(out, v); ++ out += sizeof(v); ++ len -= sizeof(v); ++ } while (len > 0); ++ return out; ++} ++ ++static inline unsigned char FAR* chunkset_core(unsigned char FAR* out, unsigned period, unsigned len) ++{ ++ z_vec128i_t v; ++ const int bump = ((len - 1) % sizeof(v)) + 1; ++ switch (period) { ++ case 1: ++ v = v_load8_dup(out - 1); ++ v_store_128(out, v); ++ out += bump; ++ len -= bump; ++ while (len > 0) { ++ v_store_128(out, v); ++ out += sizeof(v); ++ len -= sizeof(v); ++ } ++ return out; ++ case 2: ++ v = v_load16_dup(out - 2); ++ v_store_128(out, v); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ v = v_load16_dup(out - 2); ++ out = chunkset_store_result(len, out, v); ++ } ++ return out; ++ case 4: ++ v = v_load32_dup(out - 4); ++ v_store_128(out, v); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ v = v_load32_dup(out - 4); ++ out = chunkset_store_result(len, out, v); ++ } ++ return out; ++ case 8: ++ v = v_load64_dup(out - 8); ++ v_store_128(out, v); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ v = v_load64_dup(out - 8); ++ out = chunkset_store_result(len, out, v); ++ } ++ return out; ++ } ++ out = chunkunroll_relaxed(out, &period, &len); ++ return chunkcopy_core(out, out - period, len); ++} ++ ++/* ++ * Perform a memcpy-like operation, but assume that length is non-zero and that ++ * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ * the length is shorter than this. ++ * ++ * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour ++ * of overlapping buffers, regardless of the distance between the pointers. ++ * This is reflected in the `restrict`-qualified pointers, allowing the ++ * compiler to re-order loads and stores. ++ */ ++static inline unsigned char FAR* chunkcopy_relaxed( ++ unsigned char FAR* Z_RESTRICT out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len) ++{ ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ * Like chunkcopy_relaxed(), but avoid writing beyond of legal output. ++ * ++ * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the ++ * behaviour of overlapping buffers, regardless of the distance between the ++ * pointers. This is reflected in the `restrict`-qualified pointers, allowing ++ * the compiler to re-order loads and stores. ++ * ++ * Accepts an additional pointer to the end of safe output. 
A generic safe ++ * copy would use (out + len), but it's normally the case that the end of the ++ * output buffer is beyond the end of the current copy, and this can still be ++ * exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len, ++ unsigned char FAR* limit) ++{ ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ return chunkcopy_core_safe(out, from, len, limit); ++} ++ ++/* ++ * Perform chunky copy within the same buffer, where the source and destination ++ * may potentially overlap. ++ * ++ * Assumes that len > 0 on entry, and that it's safe to write at least ++ * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. ++ */ ++static inline unsigned char FAR* chunkcopy_lapped_relaxed( ++ unsigned char FAR* out, ++ unsigned dist, ++ unsigned len) ++{ ++ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { ++ return chunkset_core(out, dist, len); ++ } ++ return chunkcopy_core(out, out - dist, len); ++} ++ ++/* ++ * Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond of legal ++ * output. ++ * ++ * Accepts an additional pointer to the end of safe output. A generic safe ++ * copy would use (out + len), but it's normally the case that the end of the ++ * output buffer is beyond the end of the current copy, and this can still be ++ * exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_lapped_safe( ++ unsigned char FAR* out, ++ unsigned dist, ++ unsigned len, ++ unsigned char FAR* limit) ++{ ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if ((limit - out) < (ptrdiff_t) (3 * CHUNKCOPY_CHUNK_SIZE)) { ++ while (len-- > 0) { ++ *out = *(out - dist); ++ out++; ++ } ++ return out; ++ } ++ return chunkcopy_lapped_relaxed(out, dist, len); ++} ++ ++ ++#undef Z_STATIC_ASSERT ++#undef Z_RESTRICT ++#undef Z_BUILTIN_MEMCPY ++ ++#endif //defined(INFLATE_CHUNK_SIMD_NEON) +diff --git a/inflate.c b/inflate.c +index 8acbef4..4e695b1 100644 +--- a/inflate.c ++++ b/inflate.c +@@ -408,9 +408,16 @@ unsigned copy; + + /* if it hasn't been done already, allocate space for the window */ + if (state->window == Z_NULL) { ++#if defined(INFLATE_CHUNK_SIMD_NEON) ++ unsigned wsize = 1U << state->wbits; ++ state->window = (unsigned char FAR *) ++ ZALLOC(strm, CHUNKCOPY_CHUNK_SIZE + wsize, ++ sizeof(unsigned char)); ++#else + state->window = (unsigned char FAR *) + ZALLOC(strm, 1U << state->wbits, + sizeof(unsigned char)); ++#endif + if (state->window == Z_NULL) return 1; + } + +-- +2.33.0 + diff --git a/zlib-Optimize-CRC32.patch b/zlib-Optimize-CRC32.patch new file mode 100644 index 0000000..c0495a6 --- /dev/null +++ b/zlib-Optimize-CRC32.patch @@ -0,0 +1,94 @@ +From 8935175266e343ac1d52106e2e790810b54f26c1 Mon Sep 17 00:00:00 2001 +From: liqiang64 <liqiang64@huawei.com> +Date: Tue, 3 Dec 2019 03:22:00 +0000 +Subject: [PATCH] zlib: Optimize CRC32 + +This patch uses the NEON instruction set to optimize the CRC32 +algorithm. + +On the ARM architecture, we can optimize the efficiency of +crc32 through the interface provided by the neon instruction +set. +Modify by Li Qiang. 
+---
+ crc32.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 50 insertions(+)
+
+diff --git a/crc32.c b/crc32.c
+index f8357b0..5c53068 100644
+--- a/crc32.c
++++ b/crc32.c
+@@ -28,6 +28,9 @@
+ #endif /* MAKECRCH */
+
+ #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
++#ifdef __aarch64__
++#include "arm_acle.h"
++#endif
+
+ /*
+   A CRC of a message is computed on N braids of words in the message, where
+@@ -600,6 +603,49 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
+     return (const z_crc_t FAR *)crc_table;
+ }
+
++#ifdef __aarch64__
++ulg crc32_neon(crc, buf, len)
++    unsigned long crc;
++    const unsigned char FAR *buf;
++    z_size_t len;
++{
++    register uint32_t crc_result = 0xFFFFFFFFU;
++    register const uint8_t *buf1;
++    register const uint16_t *buf2;
++    register const uint32_t *buf4;
++    register const uint64_t *buf8;
++    int64_t length = (int64_t)len;
++    buf8 = (const uint64_t *)(const void *)buf;
++
++    if (buf == NULL) {
++        crc_result = 0xffffffffL;
++    } else {
++        crc_result = crc^0xffffffffUL;
++
++        while((length -= sizeof(uint64_t)) >= 0) {
++            crc_result = __crc32d((crc_result), *buf8++);
++        }
++
++        buf4 = (const uint32_t *)(const void *)buf8;
++        if (length & sizeof(uint32_t)) {
++            crc_result = __crc32w((crc_result), *buf4++);
++        }
++
++        buf2 = (const uint16_t *)(const void *)buf4;
++        if(length & sizeof(uint16_t)) {
++            crc_result = __crc32h((crc_result), *buf2++);
++        }
++
++        buf1 = (const uint8_t *)(const void *)buf2;
++        if (length & sizeof(uint8_t)) {
++            crc_result = __crc32b((crc_result), *buf1);
++        }
++    }
++
++    return (crc_result ^ 0xffffffffL);
++}
++#endif
++
+ /* =========================================================================
+  * Use ARM machine instructions if available. This will compute the CRC about
+  * ten times faster than the braided calculation. This code does not check for
+@@ -750,6 +794,10 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
+     z_size_t last, last2, i;
+     z_size_t num;
+
++    #ifdef __aarch64__
++    return crc32_neon(crc, buf, len);
++    #endif
++
+     /* Return initial CRC, if requested. */
+     if (buf == Z_NULL) return 0;
+
+--
+2.27.0
+
diff --git a/zlib.spec b/zlib.spec
new file mode 100644
index 0000000..3b75709
--- /dev/null
+++ b/zlib.spec
@@ -0,0 +1,148 @@
+Name: zlib
+Version: 1.3
+Release: 1
+Summary: A lossless data-compression library
+License: zlib and Boost
+URL: http://www.zlib.net
+Source0: http://www.zlib.net/zlib-%{version}.tar.xz
+
+# Patch0 get from fedora
+Patch6000: backport-zlib-1.2.5-minizip-fixuncrypt.patch
+Patch6001: backport-fix-undefined-buffer-detected-by-oss-fuzz.patch
+
+Patch9000: zlib-Optimize-CRC32.patch
+Patch9001: zlib-1.3-SIMD.patch
+
+BuildRequires: automake, autoconf, libtool
+
+%description
+Zlib is a free, general-purpose, not covered by any patents, lossless data-compression
+library for use on virtually any computer hardware and operating system. The zlib data
+format is itself portable across platforms.
+
+%package devel
+Summary: Header files and libraries for Zlib development
+Requires: %{name} = %{version}-%{release}
+
+Provides: zlib-static
+Obsoletes: zlib-static
+
+%description devel
+This package contains the static library, the header files, the tests user case and other
+development content.
+
+%package help
+Summary: Help documentation related to zlib
+BuildArch: noarch
+
+%description help
+This package includes help documentation and manuals related to zlib.
+
+%package -n minizip
+Summary: Encapsulates the operations related to zip files
+Requires: %{name} = %{version}-%{release}
+
+%description -n minizip
+Minizip is the upper library of zlib, which encapsulates the operations related to zip files.
+
+%package -n minizip-devel
+Summary: The development-related content related to minizip
+Requires: minizip = %{version}-%{release}
+Requires: %{name}-devel = %{version}-%{release}
+
+%description -n minizip-devel
+This package contains the development-related content related to minizip.
+
+%prep
+%setup -qn %{name}-%{version}
+%autosetup -b 0 -n %{name}-%{version} -p1
+
+%build
+export CFLAGS="$RPM_OPT_FLAGS"
+%ifarch aarch64
+CFLAGS+=" -march=armv8-a+crc"
+%endif
+
+./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix}
+%make_build LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now"
+
+cd contrib/minizip
+autoreconf --install
+%configure --enable-static=no
+%make_build
+
+%install
+%make_install
+
+%make_install -C contrib/minizip
+rm -f $RPM_BUILD_ROOT%_includedir/minizip/crypt.h
+
+%check
+%make_build test
+
+%files
+%license LICENSE
+%doc README ChangeLog FAQ
+%{_libdir}/libz.so.*
+
+%files devel
+%doc doc/algorithm.txt test/example.c
+%{_includedir}/zlib.h
+%{_includedir}/zconf.h
+
+%{_libdir}/libz.so
+%{_libdir}/pkgconfig/zlib.pc
+%{_libdir}/libz.a
+
+%files help
+%{_mandir}/man3/zlib.3*
+
+%files -n minizip
+%doc contrib/minizip/MiniZip64_info.txt contrib/minizip/MiniZip64_Changes.txt
+%{_libdir}/libminizip.so.*
+
+
+%files -n minizip-devel
+%dir %{_includedir}/minizip
+%{_includedir}/minizip/*.h
+
+%{_libdir}/libminizip.so
+%{_libdir}/pkgconfig/minizip.pc
+
+%changelog
+* Sat Aug 19 2023 Funda Wang <fundawang@yeah.net> - 1.3-1
+- update to 1.3
+
+* Thu Dec 29 2022 zhoupengcheng <zhoupengcheng11@huawei.com> - 1.2.13-1
+- update to zlib-1.2.13
+- remove openEuler uncompiled patch : 0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
+
+* Mon Dec 26 2022 zhoupengcheng <zhoupengcheng11@huawei.com> - 1.2.11-24
+- DESC:remove unapplied patches
+
+* Tue Aug 23 2022 shixuantong <shixuantong@h-partners.com> - 1.2.11-23
+- Fix missing patches due to different arch
+
+* Fri Aug 12 2022 dongyuzhen <dongyuzhen@h-partners.com> - 1.2.11-22
+- fix CVE-2022-37434
+
+* Mon Apr 18 2022 tianwei <tianwei12@h-partners.com> - 1.2.11-21
+- modify patch info for CVE-2018-25032
+
+* Wed Apr 13 2022 tianwei <tianwei12@h-partners.com> - 1.2.11-20
+- fix CVE-2018-25032
+
+* Thu Sep 2 2021 liqiang <liqiang64@huawei.com> - 1.2.11-19
+- Optimize Adler32 by SVE instructions.
+
+* Mon Sep 14 2020 noah <hedongbo@huawei.com> - 1.2.11-18
+- add zlib-1.2.11-SIMD.patch
+
+* Sat Dec 21 2019 openEuler Buildteam <buildteam@openeuler.org> - 1.2.11-17
+- Fix undefined buffer detected by oss-fuzz
+
+* Tue Dec 3 2019 liqiang <liqiang64@huawei.com> - 1.2.11-16
+- Optimize CRC32 by NEON
+
+* Thu Sep 5 2019 dongjian <dongjian13@huawei.com> - 1.2.11-15
+- Rebuild the zlib and fix description
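Because zlib-Optimize-CRC32.patch and zlib-1.3-SIMD.patch replace the hot crc32() and adler32() paths on aarch64, a quick sanity check beyond the spec's %check stage is to compare both checksums of the standard "123456789" test input against their published check values through the public zlib API. A minimal sketch (the file name and the link step are assumptions, not part of the spec):

    #include <stdio.h>
    #include <zlib.h>

    int main(void)
    {
        const unsigned char msg[] = "123456789";   /* standard check-value input */
        uLong crc = crc32(0L, Z_NULL, 0);          /* initial CRC-32 state */
        uLong adl = adler32(0L, Z_NULL, 0);        /* initial Adler-32 state (1) */

        crc = crc32(crc, msg, 9);
        adl = adler32(adl, msg, 9);

        /* Published check values for "123456789":
           CRC-32 = 0xcbf43926, Adler-32 = 0x091e01de. */
        printf("crc32   = %08lx %s\n", crc, crc == 0xcbf43926UL ? "ok" : "MISMATCH");
        printf("adler32 = %08lx %s\n", adl, adl == 0x091e01deUL ? "ok" : "MISMATCH");
        return (crc == 0xcbf43926UL && adl == 0x091e01deUL) ? 0 : 1;
    }

Built against the patched library, for example with gcc check_sums.c -lz, both lines should report ok on aarch64 as well as on architectures that fall back to the generic C implementations.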