nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
1From 6001db79c477b03eacc7e7049560921fb54b7845 Mon Sep 17 00:00:00 2001
2From: Richard Sandiford <richard.sandiford@arm.com>
3Date: Mon, 7 Sep 2020 20:15:36 +0100
4Subject: [PATCH] lra: Avoid cycling on certain subreg reloads [PR96796]
5
6This PR is about LRA cycling for a reload of the form:
7
8----------------------------------------------------------------------------
9Changing pseudo 196 in operand 1 of insn 103 on equiv [r105:DI*0x8+r140:DI]
10 Creating newreg=287, assigning class ALL_REGS to slow/invalid mem r287
11 Creating newreg=288, assigning class ALL_REGS to slow/invalid mem r288
12 103: r203:SI=r288:SI<<0x1+r196:DI#0
13 REG_DEAD r196:DI
14 Inserting slow/invalid mem reload before:
15 316: r287:DI=[r105:DI*0x8+r140:DI]
16 317: r288:SI=r287:DI#0
17----------------------------------------------------------------------------
18
19The problem is with r287. We rightly give it a broad starting class of
20POINTER_AND_FP_REGS (reduced from ALL_REGS by preferred_reload_class).
21However, we never make forward progress towards narrowing it down to
22a specific choice of class (POINTER_REGS or FP_REGS).
23
24I think in practice we rely on two things to narrow a reload pseudo's
25class down to a specific choice:
26
27(1) a restricted class is specified when the pseudo is created
28
29 This happens for input address reloads, where the class is taken
30 from the target's chosen base register class. It also happens
31 for simple REG reloads, where the class is taken from the chosen
32 alternative's constraints.
33
34(2) uses of the reload pseudo as a direct input operand
35
36 In this case get_reload_reg tries to reuse the existing register
37 and narrow its class, instead of creating a new reload pseudo.
38
39However, neither occurs here. As described above, r287 rightly
40starts out with a wide choice of class, ultimately derived from
41ALL_REGS, so we don't get (1). And as the comments in the PR
42explain, r287 is never used as an input reload, only the subreg is,
43so we don't get (2):
44
45----------------------------------------------------------------------------
46 Choosing alt 13 in insn 317: (0) r (1) w {*movsi_aarch64}
47 Creating newreg=291, assigning class FP_REGS to r291
48 317: r288:SI=r291:SI
49 Inserting insn reload before:
50 320: r291:SI=r287:DI#0
51----------------------------------------------------------------------------
52
53IMO, in this case we should rely on the reload of r316 to narrow
54down the class of r278. Currently we do:
55
56----------------------------------------------------------------------------
57 Choosing alt 7 in insn 316: (0) r (1) m {*movdi_aarch64}
58 Creating newreg=289 from oldreg=287, assigning class GENERAL_REGS to r289
59 316: r289:DI=[r105:DI*0x8+r140:DI]
60 Inserting insn reload after:
61 318: r287:DI=r289:DI
62---------------------------------------------------
63
64i.e. we create a new pseudo register r289 and give *that* pseudo
65GENERAL_REGS instead. This is because get_reload_reg only narrows
66down the existing class for OP_IN and OP_INOUT, not OP_OUT.
67
68But if we have a reload pseudo in a reload instruction and have chosen
69a specific class for the reload pseudo, I think we should simply install
70it for OP_OUT reloads too, if the class is a subset of the existing class.
71We will need to pick such a register whatever happens (for r289 in the
72example above). And as explained in the PR, doing this actually avoids
73an unnecessary move via the FP registers too.
74
75The patch is quite aggressive in that it does this for all reload
76pseudos in all reload instructions. I wondered about reusing the
77condition for a reload move in in_class_p:
78
79 INSN_UID (curr_insn) >= new_insn_uid_start
80 && curr_insn_set != NULL
81 && ((OBJECT_P (SET_SRC (curr_insn_set))
82 && ! CONSTANT_P (SET_SRC (curr_insn_set)))
83 || (GET_CODE (SET_SRC (curr_insn_set)) == SUBREG
84 && OBJECT_P (SUBREG_REG (SET_SRC (curr_insn_set)))
85 && ! CONSTANT_P (SUBREG_REG (SET_SRC (curr_insn_set)))))))
86
87but I can't really justify that on first principles. I think we
88should apply the rule consistently until we have a specific reason
89for doing otherwise.
90
91gcc/
92 PR rtl-optimization/96796
93 * lra-constraints.c (in_class_p): Add a default-false
94 allow_all_reload_class_changes_p parameter. Do not treat
95 reload moves specially when the parameter is true.
96 (get_reload_reg): Try to narrow the class of an existing OP_OUT
97 reload if we're reloading a reload pseudo in a reload instruction.
98
99gcc/testsuite/
100 PR rtl-optimization/96796
101 * gcc.c-torture/compile/pr96796.c: New test.
102---
103 gcc/lra-constraints.c | 54 ++++++++++++++----
104 gcc/testsuite/gcc.c-torture/compile/pr96796.c | 55 +++++++++++++++++++
105 2 files changed, 99 insertions(+), 10 deletions(-)
106 create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr96796.c
107
108diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
109index 580da9c3ed6..161b721efb1 100644
110--- a/gcc/lra-constraints.c
111+++ b/gcc/lra-constraints.c
112@@ -236,12 +236,17 @@ get_reg_class (int regno)
113 CL. Use elimination first if REG is a hard register. If REG is a
114 reload pseudo created by this constraints pass, assume that it will
115 be allocated a hard register from its allocno class, but allow that
116- class to be narrowed to CL if it is currently a superset of CL.
117+ class to be narrowed to CL if it is currently a superset of CL and
118+ if either:
119+
120+ - ALLOW_ALL_RELOAD_CLASS_CHANGES_P is true or
121+ - the instruction we're processing is not a reload move.
122
123 If NEW_CLASS is nonnull, set *NEW_CLASS to the new allocno class of
124 REGNO (reg), or NO_REGS if no change in its class was needed. */
125 static bool
126-in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class)
127+in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
128+ bool allow_all_reload_class_changes_p = false)
129 {
130 enum reg_class rclass, common_class;
131 machine_mode reg_mode;
132@@ -266,7 +271,8 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class)
133 typically moves that have many alternatives, and restricting
134 reload pseudos for one alternative may lead to situations
135 where other reload pseudos are no longer allocatable. */
136- || (INSN_UID (curr_insn) >= new_insn_uid_start
137+ || (!allow_all_reload_class_changes_p
138+ && INSN_UID (curr_insn) >= new_insn_uid_start
139 && curr_insn_set != NULL
140 && ((OBJECT_P (SET_SRC (curr_insn_set))
141 && ! CONSTANT_P (SET_SRC (curr_insn_set)))
142@@ -598,13 +604,12 @@ canonicalize_reload_addr (rtx addr)
143 return addr;
144 }
145
146-/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse already
147- created input reload pseudo (only if TYPE is not OP_OUT). Don't
148- reuse pseudo if IN_SUBREG_P is true and the reused pseudo should be
149- wrapped up in SUBREG. The result pseudo is returned through
150- RESULT_REG. Return TRUE if we created a new pseudo, FALSE if we
151- reused the already created input reload pseudo. Use TITLE to
152- describe new registers for debug purposes. */
153+/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse an existing
154+ reload pseudo. Don't reuse an existing reload pseudo if IN_SUBREG_P
155+ is true and the reused pseudo should be wrapped up in a SUBREG.
156+ The result pseudo is returned through RESULT_REG. Return TRUE if we
157+ created a new pseudo, FALSE if we reused an existing reload pseudo.
158+ Use TITLE to describe new registers for debug purposes. */
159 static bool
160 get_reload_reg (enum op_type type, machine_mode mode, rtx original,
161 enum reg_class rclass, bool in_subreg_p,
162@@ -616,6 +621,35 @@ get_reload_reg (enum op_type type, machine_mode mode, rtx original,
163
164 if (type == OP_OUT)
165 {
166+ /* Output reload registers tend to start out with a conservative
167+ choice of register class. Usually this is ALL_REGS, although
168+ a target might narrow it (for performance reasons) through
169+ targetm.preferred_reload_class. It's therefore quite common
170+ for a reload instruction to require a more restrictive class
171+ than the class that was originally assigned to the reload register.
172+
173+ In these situations, it's more efficient to refine the choice
174+ of register class rather than create a second reload register.
175+ This also helps to avoid cycling for registers that are only
176+ used by reload instructions. */
177+ if (REG_P (original)
178+ && (int) REGNO (original) >= new_regno_start
179+ && INSN_UID (curr_insn) >= new_insn_uid_start
180+ && in_class_p (original, rclass, &new_class, true))
181+ {
182+ unsigned int regno = REGNO (original);
183+ if (lra_dump_file != NULL)
184+ {
185+ fprintf (lra_dump_file, " Reuse r%d for output ", regno);
186+ dump_value_slim (lra_dump_file, original, 1);
187+ }
188+ if (new_class != lra_get_allocno_class (regno))
189+ lra_change_class (regno, new_class, ", change to", false);
190+ if (lra_dump_file != NULL)
191+ fprintf (lra_dump_file, "\n");
192+ *result_reg = original;
193+ return false;
194+ }
195 *result_reg
196 = lra_create_new_reg_with_unique_value (mode, original, rclass, title);
197 return true;
198diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96796.c b/gcc/testsuite/gcc.c-torture/compile/pr96796.c
199new file mode 100644
200index 00000000000..8808e62fe77
201--- /dev/null
202+++ b/gcc/testsuite/gcc.c-torture/compile/pr96796.c
203@@ -0,0 +1,55 @@
204+/* { dg-additional-options "-fcommon" } */
205+
206+struct S0 {
207+ signed f0 : 8;
208+ unsigned f1;
209+ unsigned f4;
210+};
211+struct S1 {
212+ long f3;
213+ char f4;
214+} g_3_4;
215+
216+int g_5, func_1_l_32, func_50___trans_tmp_31;
217+static struct S0 g_144, g_834, g_1255, g_1261;
218+
219+int g_273[120] = {};
220+int *g_555;
221+char **g_979;
222+static int g_1092_0;
223+static int g_1193;
224+int safe_mul_func_int16_t_s_s(int si1, int si2) { return si1 * si2; }
225+static struct S0 *func_50();
226+int func_1() { func_50(g_3_4, g_5, func_1_l_32, 8, 3); }
227+void safe_div_func_int64_t_s_s(int *);
228+void safe_mod_func_uint32_t_u_u(struct S0);
229+struct S0 *func_50(int p_51, struct S0 p_52, struct S1 p_53, int p_54,
230+ int p_55) {
231+ int __trans_tmp_30;
232+ char __trans_tmp_22;
233+ short __trans_tmp_19;
234+ long l_985_1;
235+ long l_1191[8];
236+ safe_div_func_int64_t_s_s(g_273);
237+ __builtin_printf((char*)g_1261.f4);
238+ safe_mod_func_uint32_t_u_u(g_834);
239+ g_144.f0 += 1;
240+ for (;;) {
241+ struct S1 l_1350 = {&l_1350};
242+ for (; p_53.f3; p_53.f3 -= 1)
243+ for (; g_1193 <= 2; g_1193 += 1) {
244+ __trans_tmp_19 = safe_mul_func_int16_t_s_s(l_1191[l_985_1 + p_53.f3],
245+ p_55 % (**g_979 = 10));
246+ __trans_tmp_22 = g_1255.f1 * p_53.f4;
247+ __trans_tmp_30 = __trans_tmp_19 + __trans_tmp_22;
248+ if (__trans_tmp_30)
249+ g_1261.f0 = p_51;
250+ else {
251+ g_1255.f0 = p_53.f3;
252+ int *l_1422 = g_834.f0 = g_144.f4 != (*l_1422)++ > 0 < 0 ^ 51;
253+ g_555 = ~0;
254+ g_1092_0 |= func_50___trans_tmp_31;
255+ }
256+ }
257+ }
258+}
259--
2602.18.4
261