author    CoprDistGit <infra@openeuler.org>    2024-08-03 06:28:41 +0000
committer CoprDistGit <infra@openeuler.org>    2024-08-03 06:28:41 +0000
commit    d20db0561a6a36f914fde030512503b114ef9a0c (patch)
tree      d4e5e3494d95c269a1cee6195f11bf3201bcadbf /x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
parent    016343d99b1b269d7246ef1e143d4b54914433d4 (diff)
Diffstat (limited to 'x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch')
-rw-r--r--    x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch    267
1 file changed, 267 insertions, 0 deletions
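
Background on the transformation (not part of the patch itself): vpternlogd treats its 8-bit immediate as the truth table of an arbitrary three-input boolean function applied per bit, and imm8 = 0xff is the constant-true function, so the destination register becomes all ones regardless of the source operands. That lets each routine drop both a memory load and its .L_2il0floatpacket.* constant in .rodata. The following minimal, standalone sketch checks this equivalence through the corresponding intrinsic; it is illustrative only and assumes an AVX-512F-capable CPU and a toolchain built with -mavx512f.

    /* Illustrative harness, not part of the glibc patch.
       Build: gcc -mavx512f ternlog-ones.c && ./a.out  */
    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Arbitrary starting contents; the result must not depend on them.  */
      __m512i x = _mm512_set1_epi32 (0x12345678);

      /* imm8 = 0xff: all eight rows of the 3-input truth table yield 1,
         so every destination bit is set -- the same all-ones value the
         old code loaded via vpbroadcastq/vmovups from .rodata.  */
      __m512i ones = _mm512_ternarylogic_epi32 (x, x, x, 0xff);

      /* Compare against an explicit all-ones vector, lane by lane.  */
      __mmask16 eq = _mm512_cmpeq_epi32_mask (ones, _mm512_set1_epi32 (-1));
      printf ("all 16 lanes all-ones: %s\n", eq == 0xffff ? "yes" : "no");
      return 0;
    }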
diff --git a/x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch b/x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
new file mode 100644
index 0000000..924e63c
--- /dev/null
+++ b/x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
@@ -0,0 +1,267 @@
+From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 20 Aug 2021 06:42:24 -0700
+Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
+ #28252]
+
+Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+by replacing
+
+ vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+
+and
+
+ vmovups .L_2il0floatpacket.13(%rip), %zmmX
+
+with
+ vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+
+This fixes BZ #28252.
+---
+ sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
+ sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
+ sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
+ 10 files changed, 11 insertions(+), 64 deletions(-)
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+index e68fcdb..58e588a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+ vmovaps %zmm0, %zmm8
+
+ /* Check for large arguments path */
+- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
+
+ /*
+ ARGUMENT RANGE REDUCTION:
+@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN8v_cos_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.16:
+- .long 0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.16,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+index dfa2aca..f5f117d 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+
+ /* preserve mantissa, set input exponent to 2^(-10) */
+ vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
+ vpsrlq $32, %zmm4, %zmm6
+
+ /* reciprocal approximation good to at least 11 bits */
+@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN8v_log_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.12:
+- .long 0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.12,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+index be8ab7c..48d251d 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
+ vmovups __dAbsMask(%rax), %zmm7
+ vmovups __dInvPI(%rax), %zmm2
+ vmovups __dRShifter(%rax), %zmm1
+@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN8v_sin_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.14:
+- .long 0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.14,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+index 6118870..a4944a4 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+
+ /* SinPoly = SinR*SinPoly */
+ vfmadd213pd %zmm5, %zmm5, %zmm4
+- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+
+ /* Update Cos result's sign */
+ vxorpd %zmm2, %zmm1, %zmm1
+@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
+ ENTRY (_ZGVeN8vvv_sincos_skx)
+ WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
+ END (_ZGVeN8vvv_sincos_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.15:
+- .long 0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.15,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+index f671d60..fe8474f 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %zmm0, %zmm6
+- vmovups .L_2il0floatpacket.13(%rip), %zmm12
++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
+ vmovups __sRShifter(%rax), %zmm3
+ vmovups __sPI1_FMA(%rax), %zmm5
+ vmovups __sA9_FMA(%rax), %zmm9
+@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN16v_cosf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.13:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+index 637bfe3..229b782 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+ vmovaps %zmm0, %zmm7
+
+ /* compare against threshold */
+- vmovups .L_2il0floatpacket.13(%rip), %zmm3
++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+ vmovups __sInvLn2(%rax), %zmm4
+ vmovups __sShifter(%rax), %zmm1
+ vmovups __sLn2hi(%rax), %zmm6
+@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+
+ #endif
+ END (_ZGVeN16v_expf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.13:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+index 9d790fb..fa2aae9 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_slog_data@GOTPCREL(%rip), %rax
+- vmovups .L_2il0floatpacket.7(%rip), %zmm6
++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
+ vmovups _iBrkValue(%rax), %zmm4
+ vmovups _sPoly_7(%rax), %zmm8
+
+@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+
+ #endif
+ END (_ZGVeN16v_logf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.7:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.7,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+index c5c43c4..6aea2a4 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+ vpsrlq $32, %zmm3, %zmm2
+ vpmovqd %zmm2, %ymm11
+ vcvtps2pd %ymm14, %zmm13
+- vmovups .L_2il0floatpacket.23(%rip), %zmm14
++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ vmovaps %zmm14, %zmm26
+ vpandd _ABSMASK(%rax), %zmm1, %zmm8
+ vpcmpd $1, _INF(%rax), %zmm8, %k2
+@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+ vpmovqd %zmm11, %ymm5
+ vpxord %zmm10, %zmm10, %zmm10
+ vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
+ vpxord %zmm11, %zmm11, %zmm11
+ vcvtdq2pd %ymm7, %zmm7
+ vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN16vv_powf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.23:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.23,@object
+-.L_2il0floatpacket.24:
+- .long 0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.24,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+index 9cf359c..a446c50 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+
+ /* Result sign calculations */
+ vpternlogd $150, %zmm0, %zmm14, %zmm1
+- vmovups .L_2il0floatpacket.13(%rip), %zmm14
++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+
+ /* Add correction term 0.5 for cos() part */
+ vaddps %zmm8, %zmm5, %zmm15
+@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
+ ENTRY (_ZGVeN16vvv_sincosf_skx)
+ WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
+ END (_ZGVeN16vvv_sincosf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.13:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+index bd05109..c1b352d 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+
+ /* Check for large and special values */
+- vmovups .L_2il0floatpacket.11(%rip), %zmm14
++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ vmovups __sAbsMask(%rax), %zmm5
+ vmovups __sInvPI(%rax), %zmm1
+ vmovups __sRShifter(%rax), %zmm2
+@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+ jmp .LBL_2_7
+ #endif
+ END (_ZGVeN16v_sinf_skx)
+-
+- .section .rodata, "a"
+-.L_2il0floatpacket.11:
+- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+- .type .L_2il0floatpacket.11,@object
+--
+1.8.3.1
+