0012-Enable-small-loop-unrolling-for-O2.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490

From 1070bc24f53e851cae55320e26715cc594efcd2f Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Thu, 8 Sep 2022 16:52:02 +0800
Subject: [PATCH 12/22] Enable small loop unrolling for O2

Modern processors has multiple way instruction decoders
For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
instructions (usually has 3 uops with a cmp/jmp pair that can be
macro-fused), the decoder would have 2 uops bubble for each iteration
and the pipeline could not be fully utilized.

Therefore, this patch enables loop unrolling for small size loop at O2
to fullfill the decoder as much as possible. It turns on rtl loop
unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
In x86 backend the default behavior is to unroll small loops with less
than 4 insns by 1 time.

This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
0.9% codesize increment. For other benchmarks the variants are minor
and overall codesize increased by 0.2%.

The kernel image size increased by 0.06%, and no impact on eembc.

gcc/ChangeLog:

	* common/config/i386/i386-common.cc (ix86_optimization_table):
	Enable small loop unroll at O2 by default.
	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
	factor if -munroll-only-small-loops enabled and -funroll-loops/
	-funroll-all-loops are disabled.
	* config/i386/i386.h (struct processor_costs): Add 2 field
	small_unroll_ninsns and small_unroll_factor.
	* config/i386/i386.opt: Add -munroll-only-small-loops.
	* doc/invoke.texi: Document -munroll-only-small-loops.
	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
	loop unrolling for -O2-speed and above if target hook
	loop_unroll_adjust exists.
	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
	when target hook loop_unroll_adjust exists.
	* config/i386/x86-tune-costs.h: Update all processor costs
	with small_unroll_ninsns = 4 and small_unroll_factor = 2.

gcc/testsuite/ChangeLog:

	* gcc.dg/guality/loop-1.c: Add additional option
	-mno-unroll-only-small-loops.
	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
	* gcc.target/i386/pr93002.c: Likewise.
---
 gcc/common/config/i386/i386-common.cc   |  1 +
 gcc/config/i386/i386.cc                 | 18 ++++++++
 gcc/config/i386/i386.h                  |  5 +++
 gcc/config/i386/i386.opt                |  4 ++
 gcc/config/i386/x86-tune-costs.h        | 58 +++++++++++++++++++++++++
 gcc/doc/invoke.texi                     | 11 ++++-
 gcc/loop-init.cc                        | 10 +++--
 gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
 gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
 10 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index e2594cae4..cdd5caa55 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
     /* The STC algorithm produces the smallest code at -Os, for x86.  */
     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
       REORDER_BLOCKS_ALGORITHM_STC },
+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
     /* Turn off -fschedule-insns by default.  It tends to make the
        problem with not enough registers even worse.  */
     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9a9ff3b34..e56004300 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23570,6 +23570,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
   unsigned i;
   unsigned mem_count = 0;
 
+  /* Unroll small size loop when unroll factor is not explicitly
+     specified.  */
+  if (!(flag_unroll_loops
+	|| flag_unroll_all_loops
+	|| loop->unroll))
+    {
+      nunroll = 1;
+
+      /* Any explicit -f{no-}unroll-{all-}loops turns off
+	 -munroll-only-small-loops.  */
+      if (ix86_unroll_only_small_loops
+	  && !OPTION_SET_P (flag_unroll_loops)
+	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+	nunroll = ix86_cost->small_unroll_factor;
+
+      return nunroll;
+    }
+
   if (!TARGET_ADJUST_UNROLL)
      return nunroll;
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index fce0b3564..688aaabd3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -219,6 +219,11 @@ struct processor_costs {
   const char *const align_jump;		/* Jump alignment.  */
   const char *const align_label;	/* Label alignment.  */
   const char *const align_func;		/* Function alignment.  */
+
+  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
+					   to be unrolled.  */
+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
+					   be unrolled.  */
 };
 
 extern const struct processor_costs *ix86_cost;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index a3675e515..fc1b944ac 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
 -param=x86-stlf-window-ninsns=
 Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
 Instructions number above which STFL stall penalty can be compensated.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index f105d57ca..db4c2da34 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   NULL,					/* Jump alignment.  */
   NULL,					/* Label alignment.  */
   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* Processor costs (relative to an add) */
@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   "4",					/* Jump alignment.  */
   NULL,					/* Label alignment.  */
   "4",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs i486_memcpy[2] = {
@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   "16",					/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs pentium_memcpy[2] = {
@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static const
@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs geode_memcpy[2] = {
@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
   NULL,					/* Jump alignment.  */
   NULL,					/* Label alignment.  */
   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs k6_memcpy[2] = {
@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
   "32:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "32",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
   "32:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "32",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /*  BDVER has optimized REP instruction for medium sized blocks, but for
@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 
@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
   "16",					/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
   "16",					/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 struct processor_costs znver3_cost = {
@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
   "16",					/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* This table currently replicates znver3_cost table. */
@@ -1952,6 +1982,8 @@ struct processor_costs znver4_cost = {
   "16",					/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
@@ -2076,6 +2108,8 @@ struct processor_costs skylake_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* icelake_cost should produce code tuned for Icelake family of CPUs.
@@ -2202,6 +2236,8 @@ struct processor_costs icelake_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
@@ -2322,6 +2358,8 @@ struct processor_costs alderlake_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -2435,6 +2473,8 @@ const struct processor_costs btver1_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs btver2_memcpy[2] = {
@@ -2545,6 +2585,8 @@ const struct processor_costs btver2_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs pentium4_memcpy[2] = {
@@ -2654,6 +2696,8 @@ struct processor_costs pentium4_cost = {
   NULL,					/* Jump alignment.  */
   NULL,					/* Label alignment.  */
   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs nocona_memcpy[2] = {
@@ -2766,6 +2810,8 @@ struct processor_costs nocona_cost = {
   NULL,					/* Jump alignment.  */
   NULL,					/* Label alignment.  */
   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs atom_memcpy[2] = {
@@ -2876,6 +2922,8 @@ struct processor_costs atom_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs slm_memcpy[2] = {
@@ -2986,6 +3034,8 @@ struct processor_costs slm_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs tremont_memcpy[2] = {
@@ -3110,6 +3160,8 @@ struct processor_costs tremont_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 static stringop_algs intel_memcpy[2] = {
@@ -3220,6 +3272,8 @@ struct processor_costs intel_cost = {
   "16:8:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* Generic should produce code tuned for Core-i7 (and newer chips)
@@ -3339,6 +3393,8 @@ struct processor_costs generic_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
@@ -3465,5 +3521,7 @@ struct processor_costs core_cost = {
   "16:11:8",				/* Jump alignment.  */
   "0:0:8",				/* Label alignment.  */
   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
 };
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ff8cd032f..16f4b367e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1449,7 +1449,8 @@ See RS/6000 and PowerPC Options.
 -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
 -mindirect-branch-register -mharden-sls=@var{choice} @gol
--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
+-munroll-only-small-loops}
 
 @emph{x86 Windows Options}
 @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
@@ -33183,6 +33184,14 @@ treat access to protected symbols as local symbols.  The default is
 @option{-mno-direct-extern-access} and executable compiled with
 @option{-mdirect-extern-access} may not be binary compatible if
 protected symbols are used in shared libraries and executable.
+
+@item -munroll-only-small-loops
+@opindex munroll-only-small-loops
+@opindex mno-unroll-only-small-loops
+Controls conservative small loop unrolling. It is default enabled by
+O2, and unrolls loop with less than 4 insns by 1 time. Explicit
+-f[no-]unroll-[all-]loops would disable this flag to avoid any
+unintended unrolling behavior that user does not want.
 @end table
 
 @node x86 Windows Options
diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
index 1e4f6cfd7..f1c717041 100644
--- a/gcc/loop-init.cc
+++ b/gcc/loop-init.cc
@@ -565,9 +565,12 @@ public:
   {}
 
   /* opt_pass methods: */
-  virtual bool gate (function *)
+  virtual bool gate (function *fun)
     {
-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+	      || (targetm.loop_unroll_adjust
+		  && optimize >= 2
+		  && optimize_function_for_speed_p (fun)));
     }
 
   virtual unsigned int execute (function *);
@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
       if (dump_file)
 	df_dump (dump_file);
 
-      if (flag_unroll_loops)
+      if (flag_unroll_loops
+	  || targetm.loop_unroll_adjust)
 	flags |= UAP_UNROLL;
       if (flag_unroll_all_loops)
 	flags |= UAP_UNROLL_ALL;
diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
index 1b1f6d322..a32ea445a 100644
--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+
 
 #include "../nop.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
index 81841ef5b..cbc9fbb04 100644
--- a/gcc/testsuite/gcc.target/i386/pr86270.c
+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 
 int *a;
 long len;
diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
index 0248fcc00..f75a847f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr93002.c
+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
@@ -1,6 +1,6 @@
 /* PR target/93002 */
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
 
 volatile int sink;
-- 
2.33.0