diff options
Diffstat (limited to '0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch')
-rw-r--r-- | 0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch new file mode 100644 index 0000000..b0de504 --- /dev/null +++ b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch @@ -0,0 +1,142 @@ +From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001 +From: Jan Hubicka <jh@suse.cz> +Date: Fri, 29 Dec 2023 23:51:03 +0100 +Subject: [PATCH 15/28] Disable FMADD in chains for Zen4 and generic + +this patch disables use of FMA in matrix multiplication loop for generic (for +x86-64-v3) and zen4. I tested this on zen4 and Xeon Gold 6212U. + +For Intel this is neutral both on the matrix multiplication microbenchmark +(attached) and spec2k17 where the difference was within noise for Core. + +On core the micro-benchmark runs as follows: + +With FMA: + + 578,500,241 cycles:u # 3.645 GHz + ( +- 0.12% ) + 753,318,477 instructions:u # 1.30 insn per +cycle ( +- 0.00% ) + 125,417,701 branches:u # 790.227 M/sec + ( +- 0.00% ) + 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) + +No FMA: + + 577,573,960 cycles:u # 3.514 GHz + ( +- 0.15% ) + 878,318,479 instructions:u # 1.52 insn per +cycle ( +- 0.00% ) + 125,417,702 branches:u # 763.035 M/sec + ( +- 0.00% ) + 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) + +So the cycle count is unchanged and discrete multiply+add takes the same time as +FMA. 
+ +While on zen: + +With FMA: + 484875179 cycles:u # 3.599 GHz + ( +- 0.05% ) (82.11%) + 752031517 instructions:u # 1.55 insn per +cycle + 125106525 branches:u # 928.712 M/sec + ( +- 0.03% ) (85.09%) + 128356 branch-misses:u # 0.10% of all +branches ( +- 0.06% ) (83.58%) + +No FMA: + 375875209 cycles:u # 3.592 GHz + ( +- 0.08% ) (80.74%) + 875725341 instructions:u # 2.33 insn per +cycle + 124903825 branches:u # 1.194 G/sec + ( +- 0.04% ) (84.59%) + 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) + +The difference is that Cores understand the fact that fmadd does not need +all three parameters to start computation, while Zen cores don't. + +Since this seems to be a noticeable win on zen and no loss on Core, it seems like a good +default for generic. + +float a[SIZE][SIZE]; +float b[SIZE][SIZE]; +float c[SIZE][SIZE]; + +void init(void) +{ + int i, j, k; + for(i=0; i<SIZE; ++i) + { + for(j=0; j<SIZE; ++j) + { + a[i][j] = (float)i + j; + b[i][j] = (float)i - j; + c[i][j] = 0.0f; + } + } +} + +void mult(void) +{ + int i, j, k; + + for(i=0; i<SIZE; ++i) + { + for(j=0; j<SIZE; ++j) + { + for(k=0; k<SIZE; ++k) + { + c[i][j] += a[i][k] * b[k][j]; + } + } + } +} + +int main(void) +{ + clock_t s, e; + + init(); + s=clock(); + mult(); + e=clock(); + printf(" mult took %10d clocks\n", (int)(e-s)); + + return 0; + +} + +gcc/ChangeLog: + + * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS, + X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic. +--- + gcc/config/i386/x86-tune.def | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def +index bdb455d20..fd095f3ec 100644 +--- a/gcc/config/i386/x86-tune.def ++++ b/gcc/config/i386/x86-tune.def +@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", + + /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or + smaller FMA chain. 
*/ +-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3) ++DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 ++ | m_ZNVER3 | m_ZNVER4 | m_GENERIC) + + /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or + smaller FMA chain. */ + DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 +- | m_ALDERLAKE | m_SAPPHIRERAPIDS) ++ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC) + + /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or + smaller FMA chain. */ +-- +2.31.1 + |