Diffstat (limited to 'zlib-Optimize-CRC32.patch')
-rw-r--r--  zlib-Optimize-CRC32.patch  94
1 file changed, 94 insertions, 0 deletions
diff --git a/zlib-Optimize-CRC32.patch b/zlib-Optimize-CRC32.patch
new file mode 100644
index 0000000..c0495a6
--- /dev/null
+++ b/zlib-Optimize-CRC32.patch
@@ -0,0 +1,94 @@
+From 8935175266e343ac1d52106e2e790810b54f26c1 Mon Sep 17 00:00:00 2001
+From: liqiang64 <liqiang64@huawei.com>
+Date: Tue, 3 Dec 2019 03:22:00 +0000
+Subject: [PATCH] zlib: Optimize CRC32
+
+This patch optimizes the CRC32 computation on AArch64 by using the
+hardware CRC32 instructions exposed as intrinsics in <arm_acle.h>.
+
+On ARMv8 processors with the CRC32 extension, these instructions
+process 1, 2, 4 or 8 bytes of input per step, which is considerably
+faster than the table-driven software calculation.
+Modified by Li Qiang.
+---
+ crc32.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 50 insertions(+)
+
+diff --git a/crc32.c b/crc32.c
+index f8357b0..5c53068 100644
+--- a/crc32.c
++++ b/crc32.c
+@@ -28,6 +28,9 @@
+ #endif /* MAKECRCH */
+
+ #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
++#ifdef __aarch64__
++#include <arm_acle.h>
++#endif
+
+ /*
+   A CRC of a message is computed on N braids of words in the message, where
+@@ -600,6 +603,49 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
+     return (const z_crc_t FAR *)crc_table;
+ }
+
++#ifdef __aarch64__
++ulg crc32_neon(crc, buf, len)
++    unsigned long crc;
++    const unsigned char FAR *buf;
++    z_size_t len;
++{
++    register uint32_t crc_result = 0xffffffffUL;
++    register const uint8_t *buf1;
++    register const uint16_t *buf2;
++    register const uint32_t *buf4;
++    register const uint64_t *buf8;
++    int64_t length = (int64_t)len;
++    buf8 = (const uint64_t *)(const void *)buf;
++
++    if (buf == NULL) {
++        crc_result = 0xffffffffUL;            /* initial CRC requested */
++    } else {
++        crc_result = crc ^ 0xffffffffUL;      /* pre-condition the CRC */
++
++        while ((length -= sizeof(uint64_t)) >= 0) {
++            crc_result = __crc32d(crc_result, *buf8++);  /* eight bytes per step */
++        }
++
++        buf4 = (const uint32_t *)(const void *)buf8;     /* 0..7 bytes remain */
++        if (length & sizeof(uint32_t)) {
++            crc_result = __crc32w(crc_result, *buf4++);
++        }
++
++        buf2 = (const uint16_t *)(const void *)buf4;
++        if (length & sizeof(uint16_t)) {
++            crc_result = __crc32h(crc_result, *buf2++);
++        }
++
++        buf1 = (const uint8_t *)(const void *)buf2;
++        if (length & sizeof(uint8_t)) {
++            crc_result = __crc32b(crc_result, *buf1);
++        }
++    }
++
++    return (crc_result ^ 0xffffffffUL);       /* post-condition the CRC */
++}
++#endif
++
+ /* =========================================================================
+  * Use ARM machine instructions if available. This will compute the CRC about
+  * ten times faster than the braided calculation. This code does not check for
+@@ -750,6 +794,10 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
+     z_size_t last, last2, i;
+     z_size_t num;
+
++#ifdef __aarch64__
++    return crc32_neon(crc, buf, len);
++#endif
++
+
+     /* Return initial CRC, if requested. */
+     if (buf == Z_NULL) return 0;
+
+--
+2.27.0
+
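
A quick way to exercise the patched path is through zlib's public crc32() API: with this patch applied on an AArch64 build, crc32() (via crc32_z()) returns early through crc32_neon(), so existing callers need no source changes. The sketch below is not part of the patch; the file name test_crc32.c and the build line are illustrative only, and it assumes the library was built from the patched sources.

    /* test_crc32.c - minimal check of the CRC32 path (illustrative, not part of the patch). */
    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        const unsigned char data[] = "The quick brown fox jumps over the lazy dog";
        unsigned long crc = crc32(0L, Z_NULL, 0);      /* fetch the initial CRC */

        crc = crc32(crc, data, (uInt)strlen((const char *)data));
        printf("crc32 = 0x%08lx\n", crc);              /* expected: 0x414fa339 */
        return (crc == 0x414fa339UL) ? 0 : 1;
    }

Built against the patched library with something like "cc -O2 test_crc32.c -lz", the reported value should match the known CRC-32 of the test string whether the hardware or the table-driven path is taken. One design note: the patch gates crc32_neon() only on __aarch64__ rather than __ARM_FEATURE_CRC32, so depending on the toolchain, crc32.c may need to be compiled with the CRC extension enabled (for example -march=armv8-a+crc) for the <arm_acle.h> intrinsics to be available.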