1 files changed, 142 insertions, 0 deletions
diff --git a/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
new file mode 100644
index 0000000..b0de504
--- /dev/null
+++ b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
@@ -0,0 +1,142 @@
+From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001
+From: Jan Hubicka <jh@suse.cz>
+Date: Fri, 29 Dec 2023 23:51:03 +0100
+Subject: [PATCH 15/28] Disable FMADD in chains for Zen4 and generic
+
+this patch disables use of FMA in matrix multiplication loop for generic (for
+x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.
+
+For Intel this is neutral both on the matrix multiplication microbenchmark
+(attached) and spec2k17 where the difference was within noise for Core.
+
+On core the micro-benchmark runs as follows:
+
+With FMA:
+
+       578,500,241      cycles:u                         #    3.645 GHz
+                ( +-  0.12% )
+       753,318,477      instructions:u                   #    1.30  insn per
+cycle              ( +-  0.00% )
+       125,417,701      branches:u                       #  790.227 M/sec
+                ( +-  0.00% )
+          0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )
+
+No FMA:
+
+       577,573,960      cycles:u                         #    3.514 GHz
+                ( +-  0.15% )
+       878,318,479      instructions:u                   #    1.52  insn per
+cycle              ( +-  0.00% )
+       125,417,702      branches:u                       #  763.035 M/sec
+                ( +-  0.00% )
+          0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )
+
+So the cycle count is unchanged and discrete multiply+add takes same time as
+FMA.
+
+While on zen:
+
+With FMA:
+         484875179      cycles:u                         #    3.599 GHz
+             ( +-  0.05% )  (82.11%)
+         752031517      instructions:u                   #    1.55  insn per
+cycle
+         125106525      branches:u                       #  928.712 M/sec
+             ( +-  0.03% )  (85.09%)
+            128356      branch-misses:u                  #    0.10% of all
+branches          ( +-  0.06% )  (83.58%)
+
+No FMA:
+         375875209      cycles:u                         #    3.592 GHz
+             ( +-  0.08% )  (80.74%)
+         875725341      instructions:u                   #    2.33  insn per
+cycle
+         124903825      branches:u                       #    1.194 G/sec
+             ( +-  0.04% )  (84.59%)
+          0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )
+
+The diffrerence is that Cores understand the fact that fmadd does not need
+all three parameters to start computation, while Zen cores doesn't.
+
+Since this seems noticeable win on zen and not loss on Core it seems like good
+default for generic.
+
+float a[SIZE][SIZE];
+float b[SIZE][SIZE];
+float c[SIZE][SIZE];
+
+void init(void)
+{
+   int i, j, k;
+   for(i=0; i<SIZE; ++i)
+   {
+      for(j=0; j<SIZE; ++j)
+      {
+         a[i][j] = (float)i + j;
+         b[i][j] = (float)i - j;
+         c[i][j] = 0.0f;
+      }
+   }
+}
+
+void mult(void)
+{
+   int i, j, k;
+
+   for(i=0; i<SIZE; ++i)
+   {
+      for(j=0; j<SIZE; ++j)
+      {
+         for(k=0; k<SIZE; ++k)
+         {
+            c[i][j] += a[i][k] * b[k][j];
+         }
+      }
+   }
+}
+
+int main(void)
+{
+   clock_t s, e;
+
+   init();
+   s=clock();
+   mult();
+   e=clock();
+   printf("        mult took %10d clocks\n", (int)(e-s));
+
+   return 0;
+
+}
+
+gcc/ChangeLog:
+
+	* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
+	X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and Core.
+---
+ gcc/config/i386/x86-tune.def | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index bdb455d20..fd095f3ec 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
+ 
+ /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+    smaller FMA chain.  */
+-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
++DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
++	  | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
+ 
+ /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
+    smaller FMA chain.  */
+ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
+-	  | m_ALDERLAKE | m_SAPPHIRERAPIDS)
++	  | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
+ 
+ /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
+    smaller FMA chain.  */
+-- 
+2.31.1
+