···1-From 6001db79c477b03eacc7e7049560921fb54b7845 Mon Sep 17 00:00:00 2001
2-From: Richard Sandiford <richard.sandiford@arm.com>
3-Date: Mon, 7 Sep 2020 20:15:36 +0100
4-Subject: [PATCH] lra: Avoid cycling on certain subreg reloads [PR96796]
5-6-This PR is about LRA cycling for a reload of the form:
7-8-----------------------------------------------------------------------------
9-Changing pseudo 196 in operand 1 of insn 103 on equiv [r105:DI*0x8+r140:DI]
10- Creating newreg=287, assigning class ALL_REGS to slow/invalid mem r287
11- Creating newreg=288, assigning class ALL_REGS to slow/invalid mem r288
12- 103: r203:SI=r288:SI<<0x1+r196:DI#0
13- REG_DEAD r196:DI
14- Inserting slow/invalid mem reload before:
15- 316: r287:DI=[r105:DI*0x8+r140:DI]
16- 317: r288:SI=r287:DI#0
17-----------------------------------------------------------------------------
18-19-The problem is with r287. We rightly give it a broad starting class of
20-POINTER_AND_FP_REGS (reduced from ALL_REGS by preferred_reload_class).
21-However, we never make forward progress towards narrowing it down to
22-a specific choice of class (POINTER_REGS or FP_REGS).
23-24-I think in practice we rely on two things to narrow a reload pseudo's
25-class down to a specific choice:
26-27-(1) a restricted class is specified when the pseudo is created
28-29- This happens for input address reloads, where the class is taken
30- from the target's chosen base register class. It also happens
31- for simple REG reloads, where the class is taken from the chosen
32- alternative's constraints.
33-34-(2) uses of the reload pseudo as a direct input operand
35-36- In this case get_reload_reg tries to reuse the existing register
37- and narrow its class, instead of creating a new reload pseudo.
38-39-However, neither occurs here. As described above, r287 rightly
40-starts out with a wide choice of class, ultimately derived from
41-ALL_REGS, so we don't get (1). And as the comments in the PR
42-explain, r287 is never used as an input reload, only the subreg is,
43-so we don't get (2):
44-45-----------------------------------------------------------------------------
46- Choosing alt 13 in insn 317: (0) r (1) w {*movsi_aarch64}
47- Creating newreg=291, assigning class FP_REGS to r291
48- 317: r288:SI=r291:SI
49- Inserting insn reload before:
50- 320: r291:SI=r287:DI#0
51-----------------------------------------------------------------------------
52-53-IMO, in this case we should rely on the reload of r316 to narrow
54-down the class of r278. Currently we do:
55-56-----------------------------------------------------------------------------
57- Choosing alt 7 in insn 316: (0) r (1) m {*movdi_aarch64}
58- Creating newreg=289 from oldreg=287, assigning class GENERAL_REGS to r289
59- 316: r289:DI=[r105:DI*0x8+r140:DI]
60- Inserting insn reload after:
61- 318: r287:DI=r289:DI
62----------------------------------------------------
63-64-i.e. we create a new pseudo register r289 and give *that* pseudo
65-GENERAL_REGS instead. This is because get_reload_reg only narrows
66-down the existing class for OP_IN and OP_INOUT, not OP_OUT.
67-68-But if we have a reload pseudo in a reload instruction and have chosen
69-a specific class for the reload pseudo, I think we should simply install
70-it for OP_OUT reloads too, if the class is a subset of the existing class.
71-We will need to pick such a register whatever happens (for r289 in the
72-example above). And as explained in the PR, doing this actually avoids
73-an unnecessary move via the FP registers too.
74-75-The patch is quite aggressive in that it does this for all reload
76-pseudos in all reload instructions. I wondered about reusing the
77-condition for a reload move in in_class_p:
78-79- INSN_UID (curr_insn) >= new_insn_uid_start
80- && curr_insn_set != NULL
81- && ((OBJECT_P (SET_SRC (curr_insn_set))
82- && ! CONSTANT_P (SET_SRC (curr_insn_set)))
83- || (GET_CODE (SET_SRC (curr_insn_set)) == SUBREG
84- && OBJECT_P (SUBREG_REG (SET_SRC (curr_insn_set)))
85- && ! CONSTANT_P (SUBREG_REG (SET_SRC (curr_insn_set)))))))
86-87-but I can't really justify that on first principles. I think we
88-should apply the rule consistently until we have a specific reason
89-for doing otherwise.
90-91-gcc/
92- PR rtl-optimization/96796
93- * lra-constraints.c (in_class_p): Add a default-false
94- allow_all_reload_class_changes_p parameter. Do not treat
95- reload moves specially when the parameter is true.
96- (get_reload_reg): Try to narrow the class of an existing OP_OUT
97- reload if we're reloading a reload pseudo in a reload instruction.
98-99-gcc/testsuite/
100- PR rtl-optimization/96796
101- * gcc.c-torture/compile/pr96796.c: New test.
102----
103- gcc/lra-constraints.c | 54 ++++++++++++++----
104- gcc/testsuite/gcc.c-torture/compile/pr96796.c | 55 +++++++++++++++++++
105- 2 files changed, 99 insertions(+), 10 deletions(-)
106- create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr96796.c
107-108-diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
109-index 580da9c3ed6..161b721efb1 100644
110---- a/gcc/lra-constraints.c
111-+++ b/gcc/lra-constraints.c
112-@@ -236,12 +236,17 @@ get_reg_class (int regno)
113- CL. Use elimination first if REG is a hard register. If REG is a
114- reload pseudo created by this constraints pass, assume that it will
115- be allocated a hard register from its allocno class, but allow that
116-- class to be narrowed to CL if it is currently a superset of CL.
117-+ class to be narrowed to CL if it is currently a superset of CL and
118-+ if either:
119-+
120-+ - ALLOW_ALL_RELOAD_CLASS_CHANGES_P is true or
121-+ - the instruction we're processing is not a reload move.
122-123- If NEW_CLASS is nonnull, set *NEW_CLASS to the new allocno class of
124- REGNO (reg), or NO_REGS if no change in its class was needed. */
125- static bool
126--in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class)
127-+in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
128-+ bool allow_all_reload_class_changes_p = false)
129- {
130- enum reg_class rclass, common_class;
131- machine_mode reg_mode;
132-@@ -266,7 +271,8 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class)
133- typically moves that have many alternatives, and restricting
134- reload pseudos for one alternative may lead to situations
135- where other reload pseudos are no longer allocatable. */
136-- || (INSN_UID (curr_insn) >= new_insn_uid_start
137-+ || (!allow_all_reload_class_changes_p
138-+ && INSN_UID (curr_insn) >= new_insn_uid_start
139- && curr_insn_set != NULL
140- && ((OBJECT_P (SET_SRC (curr_insn_set))
141- && ! CONSTANT_P (SET_SRC (curr_insn_set)))
142-@@ -598,13 +604,12 @@ canonicalize_reload_addr (rtx addr)
143- return addr;
144- }
145-146--/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse already
147-- created input reload pseudo (only if TYPE is not OP_OUT). Don't
148-- reuse pseudo if IN_SUBREG_P is true and the reused pseudo should be
149-- wrapped up in SUBREG. The result pseudo is returned through
150-- RESULT_REG. Return TRUE if we created a new pseudo, FALSE if we
151-- reused the already created input reload pseudo. Use TITLE to
152-- describe new registers for debug purposes. */
153-+/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse an existing
154-+ reload pseudo. Don't reuse an existing reload pseudo if IN_SUBREG_P
155-+ is true and the reused pseudo should be wrapped up in a SUBREG.
156-+ The result pseudo is returned through RESULT_REG. Return TRUE if we
157-+ created a new pseudo, FALSE if we reused an existing reload pseudo.
158-+ Use TITLE to describe new registers for debug purposes. */
159- static bool
160- get_reload_reg (enum op_type type, machine_mode mode, rtx original,
161- enum reg_class rclass, bool in_subreg_p,
162-@@ -616,6 +621,35 @@ get_reload_reg (enum op_type type, machine_mode mode, rtx original,
163-164- if (type == OP_OUT)
165- {
166-+ /* Output reload registers tend to start out with a conservative
167-+ choice of register class. Usually this is ALL_REGS, although
168-+ a target might narrow it (for performance reasons) through
169-+ targetm.preferred_reload_class. It's therefore quite common
170-+ for a reload instruction to require a more restrictive class
171-+ than the class that was originally assigned to the reload register.
172-+
173-+ In these situations, it's more efficient to refine the choice
174-+ of register class rather than create a second reload register.
175-+ This also helps to avoid cycling for registers that are only
176-+ used by reload instructions. */
177-+ if (REG_P (original)
178-+ && (int) REGNO (original) >= new_regno_start
179-+ && INSN_UID (curr_insn) >= new_insn_uid_start
180-+ && in_class_p (original, rclass, &new_class, true))
181-+ {
182-+ unsigned int regno = REGNO (original);
183-+ if (lra_dump_file != NULL)
184-+ {
185-+ fprintf (lra_dump_file, " Reuse r%d for output ", regno);
186-+ dump_value_slim (lra_dump_file, original, 1);
187-+ }
188-+ if (new_class != lra_get_allocno_class (regno))
189-+ lra_change_class (regno, new_class, ", change to", false);
190-+ if (lra_dump_file != NULL)
191-+ fprintf (lra_dump_file, "\n");
192-+ *result_reg = original;
193-+ return false;
194-+ }
195- *result_reg
196- = lra_create_new_reg_with_unique_value (mode, original, rclass, title);
197- return true;
198-diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96796.c b/gcc/testsuite/gcc.c-torture/compile/pr96796.c
199-new file mode 100644
200-index 00000000000..8808e62fe77
201---- /dev/null
202-+++ b/gcc/testsuite/gcc.c-torture/compile/pr96796.c
203-@@ -0,0 +1,55 @@
204-+/* { dg-additional-options "-fcommon" } */
205-+
206-+struct S0 {
207-+ signed f0 : 8;
208-+ unsigned f1;
209-+ unsigned f4;
210-+};
211-+struct S1 {
212-+ long f3;
213-+ char f4;
214-+} g_3_4;
215-+
216-+int g_5, func_1_l_32, func_50___trans_tmp_31;
217-+static struct S0 g_144, g_834, g_1255, g_1261;
218-+
219-+int g_273[120] = {};
220-+int *g_555;
221-+char **g_979;
222-+static int g_1092_0;
223-+static int g_1193;
224-+int safe_mul_func_int16_t_s_s(int si1, int si2) { return si1 * si2; }
225-+static struct S0 *func_50();
226-+int func_1() { func_50(g_3_4, g_5, func_1_l_32, 8, 3); }
227-+void safe_div_func_int64_t_s_s(int *);
228-+void safe_mod_func_uint32_t_u_u(struct S0);
229-+struct S0 *func_50(int p_51, struct S0 p_52, struct S1 p_53, int p_54,
230-+ int p_55) {
231-+ int __trans_tmp_30;
232-+ char __trans_tmp_22;
233-+ short __trans_tmp_19;
234-+ long l_985_1;
235-+ long l_1191[8];
236-+ safe_div_func_int64_t_s_s(g_273);
237-+ __builtin_printf((char*)g_1261.f4);
238-+ safe_mod_func_uint32_t_u_u(g_834);
239-+ g_144.f0 += 1;
240-+ for (;;) {
241-+ struct S1 l_1350 = {&l_1350};
242-+ for (; p_53.f3; p_53.f3 -= 1)
243-+ for (; g_1193 <= 2; g_1193 += 1) {
244-+ __trans_tmp_19 = safe_mul_func_int16_t_s_s(l_1191[l_985_1 + p_53.f3],
245-+ p_55 % (**g_979 = 10));
246-+ __trans_tmp_22 = g_1255.f1 * p_53.f4;
247-+ __trans_tmp_30 = __trans_tmp_19 + __trans_tmp_22;
248-+ if (__trans_tmp_30)
249-+ g_1261.f0 = p_51;
250-+ else {
251-+ g_1255.f0 = p_53.f3;
252-+ int *l_1422 = g_834.f0 = g_144.f4 != (*l_1422)++ > 0 < 0 ^ 51;
253-+ g_555 = ~0;
254-+ g_1092_0 |= func_50___trans_tmp_31;
255-+ }
256-+ }
257-+ }
258-+}
259---
260-2.18.4
261-