summaryrefslogtreecommitdiff
path: root/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
diff options
context:
space:
mode:
Diffstat (limited to '0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch')
-rw-r--r--0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch142
1 files changed, 142 insertions, 0 deletions
diff --git a/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
new file mode 100644
index 0000000..b0de504
--- /dev/null
+++ b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
@@ -0,0 +1,142 @@
+From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001
+From: Jan Hubicka <jh@suse.cz>
+Date: Fri, 29 Dec 2023 23:51:03 +0100
+Subject: [PATCH 15/28] Disable FMADD in chains for Zen4 and generic
+
+This patch disables use of FMA in matrix multiplication loops for generic (for
+x86-64-v3) and zen4. I tested this on zen4 and Xeon Gold 6212U.
+
+For Intel this is neutral both on the matrix multiplication microbenchmark
+(attached) and spec2k17 where the difference was within noise for Core.
+
+On core the micro-benchmark runs as follows:
+
+With FMA:
+
+ 578,500,241 cycles:u # 3.645 GHz
+ ( +- 0.12% )
+ 753,318,477 instructions:u # 1.30 insn per
+cycle ( +- 0.00% )
+ 125,417,701 branches:u # 790.227 M/sec
+ ( +- 0.00% )
+ 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% )
+
+No FMA:
+
+ 577,573,960 cycles:u # 3.514 GHz
+ ( +- 0.15% )
+ 878,318,479 instructions:u # 1.52 insn per
+cycle ( +- 0.00% )
+ 125,417,702 branches:u # 763.035 M/sec
+ ( +- 0.00% )
+ 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% )
+
+So the cycle count is unchanged and discrete multiply+add takes the same time
+as FMA.
+
+While on zen:
+
+With FMA:
+ 484875179 cycles:u # 3.599 GHz
+ ( +- 0.05% ) (82.11%)
+ 752031517 instructions:u # 1.55 insn per
+cycle
+ 125106525 branches:u # 928.712 M/sec
+ ( +- 0.03% ) (85.09%)
+ 128356 branch-misses:u # 0.10% of all
+branches ( +- 0.06% ) (83.58%)
+
+No FMA:
+ 375875209 cycles:u # 3.592 GHz
+ ( +- 0.08% ) (80.74%)
+ 875725341 instructions:u # 2.33 insn per
+cycle
+ 124903825 branches:u # 1.194 G/sec
+ ( +- 0.04% ) (84.59%)
+ 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% )
+
+The difference is that Cores understand the fact that fmadd does not need
+all three parameters to start computation, while Zen cores don't.
+
+Since this seems a noticeable win on zen and not a loss on Core, it seems like
+a good default for generic.
+
+float a[SIZE][SIZE];
+float b[SIZE][SIZE];
+float c[SIZE][SIZE];
+
+void init(void)
+{
+ int i, j, k;
+ for(i=0; i<SIZE; ++i)
+ {
+ for(j=0; j<SIZE; ++j)
+ {
+ a[i][j] = (float)i + j;
+ b[i][j] = (float)i - j;
+ c[i][j] = 0.0f;
+ }
+ }
+}
+
+void mult(void)
+{
+ int i, j, k;
+
+ for(i=0; i<SIZE; ++i)
+ {
+ for(j=0; j<SIZE; ++j)
+ {
+ for(k=0; k<SIZE; ++k)
+ {
+ c[i][j] += a[i][k] * b[k][j];
+ }
+ }
+ }
+}
+
+int main(void)
+{
+ clock_t s, e;
+
+ init();
+ s=clock();
+ mult();
+ e=clock();
+ printf(" mult took %10d clocks\n", (int)(e-s));
+
+ return 0;
+
+}
+
+gcc/ChangeLog:
+
+ * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
+ X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.
+---
+ gcc/config/i386/x86-tune.def | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index bdb455d20..fd095f3ec 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
+
+ /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+ smaller FMA chain. */
+-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
++DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
++ | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
+
+ /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
+ smaller FMA chain. */
+ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
+- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
++ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
+
+ /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
+ smaller FMA chain. */
+--
+2.31.1
+