1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001
From: Jan Hubicka <jh@suse.cz>
Date: Fri, 29 Dec 2023 23:51:03 +0100
Subject: [PATCH 15/28] Disable FMADD in chains for Zen4 and generic
This patch disables the use of FMA in the matrix multiplication loop for generic
(for x86-64-v3) and zen4. I tested this on zen4 and a Xeon Gold 6212U.
For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.
On core the micro-benchmark runs as follows:
With FMA:
578,500,241 cycles:u # 3.645 GHz
( +- 0.12% )
753,318,477 instructions:u # 1.30 insn per
cycle ( +- 0.00% )
125,417,701 branches:u # 790.227 M/sec
( +- 0.00% )
0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% )
No FMA:
577,573,960 cycles:u # 3.514 GHz
( +- 0.15% )
878,318,479 instructions:u # 1.52 insn per
cycle ( +- 0.00% )
125,417,702 branches:u # 763.035 M/sec
( +- 0.00% )
0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% )
So the cycle count is unchanged and a discrete multiply+add takes the same time
as FMA.
While on zen:
With FMA:
484875179 cycles:u # 3.599 GHz
( +- 0.05% ) (82.11%)
752031517 instructions:u # 1.55 insn per
cycle
125106525 branches:u # 928.712 M/sec
( +- 0.03% ) (85.09%)
128356 branch-misses:u # 0.10% of all
branches ( +- 0.06% ) (83.58%)
No FMA:
375875209 cycles:u # 3.592 GHz
( +- 0.08% ) (80.74%)
875725341 instructions:u # 2.33 insn per
cycle
124903825 branches:u # 1.194 G/sec
( +- 0.04% ) (84.59%)
0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% )
The difference is that Core CPUs understand that fmadd does not need
all three parameters to start computation, while Zen cores don't.
Since this seems to be a noticeable win on zen and not a loss on Core, it seems
like a good default for generic.
/* Benchmark matrices: a and b are the multiplication inputs, c receives the
   product.  SIZE (and the headers the code below needs) are not part of the
   text shown here and must be supplied when building the benchmark.  */
float a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE];
/* Fill the benchmark matrices: a[i][j] = i + j and b[i][j] = i - j (both
   evaluated in float), and clear every element of c so that mult () can
   accumulate into it.  */
void init(void)
{
  int i, j;

  for (i = 0; i < SIZE; ++i)
    {
      for (j = 0; j < SIZE; ++j)
        {
          a[i][j] = (float)i + j;
          b[i][j] = (float)i - j;
          c[i][j] = 0.0f;
        }
    }
}
/* Benchmark kernel: add the product of matrices a and b into c.

   Each c element is accumulated in place with a multiply followed by an add;
   c is added to rather than overwritten, so it has to be zeroed beforehand
   (init () does that).  */
void mult(void)
{
  for (int row = 0; row < SIZE; ++row) {
    for (int col = 0; col < SIZE; ++col) {
      /* Dot product of row `row` of a with column `col` of b.  */
      for (int inner = 0; inner < SIZE; ++inner) {
        c[row][col] += a[row][inner] * b[inner][col];
      }
    }
  }
}
/* Benchmark driver: set up the matrices, time one mult () call with clock (),
   and print the elapsed tick count.  Always returns 0.  */
int main(void)
{
  init();

  /* Only the multiplication is timed; initialisation stays outside.  */
  const clock_t start = clock();
  mult();
  const clock_t elapsed = clock() - start;

  /* The tick difference is narrowed to int to match the %10d conversion.  */
  printf(" mult took %10d clocks\n", (int)elapsed);
  return 0;
}
gcc/ChangeLog:
* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.
---
gcc/config/i386/x86-tune.def | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index bdb455d20..fd095f3ec 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
+ | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain. */
--
2.31.1
|