1From 9f69a58623bd01349a18ba0c7a9cb1dad6a51e8e Mon Sep 17 00:00:00 2001
2From: Serhiy Storchaka <storchaka@gmail.com>
3Date: Mon, 12 May 2025 20:42:23 +0300
4Subject: [PATCH] gh-133767: Fix use-after-free in the unicode-escape decoder
5 with an error handler (GH-129648)
6
7If the error handler is used, a new bytes object is created to set as
8the object attribute of UnicodeDecodeError, and that bytes object then
9replaces the original data. A pointer to the decoded data will become invalid
10after destroying that temporary bytes object. So we need another way to return
11the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().
12
13_PyBytes_DecodeEscape() does not have such an issue, because it does not
14use the error handlers registry, but it should be changed for compatibility
15with _PyUnicode_DecodeUnicodeEscapeInternal().
16---
17 Include/internal/pycore_bytesobject.h | 5 +-
18 Include/internal/pycore_unicodeobject.h | 12 +++--
19 Lib/test/test_codeccallbacks.py | 39 +++++++++++++-
20 Lib/test/test_codecs.py | 52 +++++++++++++++----
21 ...-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst | 2 +
22 Objects/bytesobject.c | 41 ++++++++-------
23 Objects/unicodeobject.c | 46 +++++++++-------
24 Parser/string_parser.c | 26 ++++++----
25 8 files changed, 160 insertions(+), 63 deletions(-)
26 create mode 100644 Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
27
28diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h
29index 300e7f4896a39e..8ea9b3ebb88454 100644
30--- a/Include/internal/pycore_bytesobject.h
31+++ b/Include/internal/pycore_bytesobject.h
32@@ -20,8 +20,9 @@ extern PyObject* _PyBytes_FromHex(
33
34 // Helper for PyBytes_DecodeEscape that detects invalid escape chars.
35 // Export for test_peg_generator.
36-PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
37- const char *, const char **);
38+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
39+ const char *,
40+ int *, const char **);
41
42
43 // Substring Search.
44diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
45index c85d53b89accdb..3791b913c17546 100644
46--- a/Include/internal/pycore_unicodeobject.h
47+++ b/Include/internal/pycore_unicodeobject.h
48@@ -139,14 +139,18 @@ extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
49 // Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
50 // chars.
51 // Export for test_peg_generator.
52-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
53+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
54 const char *string, /* Unicode-Escape encoded string */
55 Py_ssize_t length, /* size of string */
56 const char *errors, /* error handling */
57 Py_ssize_t *consumed, /* bytes consumed */
58- const char **first_invalid_escape); /* on return, points to first
59- invalid escaped char in
60- string. */
61+ int *first_invalid_escape_char, /* on return, if not -1, contain the first
62+ invalid escaped char (<= 0xff) or invalid
63+ octal escape (> 0xff) in string. */
64+ const char **first_invalid_escape_ptr); /* on return, if not NULL, may
65+ point to the first invalid escaped
66+ char in string.
67+ May be NULL if errors is not NULL. */
68
69 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
70
71diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
72index 86e5e5c1474674..a767f67a02cf56 100644
73--- a/Lib/test/test_codeccallbacks.py
74+++ b/Lib/test/test_codeccallbacks.py
75@@ -2,6 +2,7 @@
76 import codecs
77 import html.entities
78 import itertools
79+import re
80 import sys
81 import unicodedata
82 import unittest
83@@ -1125,7 +1126,7 @@ def test_bug828737(self):
84 text = 'abc<def>ghi'*n
85 text.translate(charmap)
86
87- def test_mutatingdecodehandler(self):
88+ def test_mutating_decode_handler(self):
89 baddata = [
90 ("ascii", b"\xff"),
91 ("utf-7", b"++"),
92@@ -1160,6 +1161,42 @@ def mutating(exc):
93 for (encoding, data) in baddata:
94 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
95
96+ def test_mutating_decode_handler_unicode_escape(self):
97+ decode = codecs.unicode_escape_decode
98+ def mutating(exc):
99+ if isinstance(exc, UnicodeDecodeError):
100+ r = data.get(exc.object[:exc.end])
101+ if r is not None:
102+ exc.object = r[0] + exc.object[exc.end:]
103+ return ('\u0404', r[1])
104+ raise AssertionError("don't know how to handle %r" % exc)
105+
106+ codecs.register_error('test.mutating2', mutating)
107+ data = {
108+ br'\x0': (b'\\', 0),
109+ br'\x3': (b'xxx\\', 3),
110+ br'\x5': (b'x\\', 1),
111+ }
112+ def check(input, expected, msg):
113+ with self.assertWarns(DeprecationWarning) as cm:
114+ self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
115+ self.assertIn(msg, str(cm.warning))
116+
117+ check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
118+ check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
119+ check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
120+
121+ check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
122+ check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
123+ check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
124+ check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
125+ check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
126+
127+ check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
128+ check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
129+ check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
130+ check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
131+
132 # issue32583
133 def test_crashing_decode_handler(self):
134 # better generating one more character to fill the extra space slot
135diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
136index 94fcf98e75721f..d42270da15ee32 100644
137--- a/Lib/test/test_codecs.py
138+++ b/Lib/test/test_codecs.py
139@@ -1196,23 +1196,39 @@ def test_escape(self):
140 check(br"[\1010]", b"[A0]")
141 check(br"[\x41]", b"[A]")
142 check(br"[\x410]", b"[A0]")
143+
144+ def test_warnings(self):
145+ decode = codecs.escape_decode
146+ check = coding_checker(self, decode)
147 for i in range(97, 123):
148 b = bytes([i])
149 if b not in b'abfnrtvx':
150- with self.assertWarns(DeprecationWarning):
151+ with self.assertWarnsRegex(DeprecationWarning,
152+ r'"\\%c" is an invalid escape sequence' % i):
153 check(b"\\" + b, b"\\" + b)
154- with self.assertWarns(DeprecationWarning):
155+ with self.assertWarnsRegex(DeprecationWarning,
156+ r'"\\%c" is an invalid escape sequence' % (i-32)):
157 check(b"\\" + b.upper(), b"\\" + b.upper())
158- with self.assertWarns(DeprecationWarning):
159+ with self.assertWarnsRegex(DeprecationWarning,
160+ r'"\\8" is an invalid escape sequence'):
161 check(br"\8", b"\\8")
162 with self.assertWarns(DeprecationWarning):
163 check(br"\9", b"\\9")
164- with self.assertWarns(DeprecationWarning):
165+ with self.assertWarnsRegex(DeprecationWarning,
166+ r'"\\\xfa" is an invalid escape sequence') as cm:
167 check(b"\\\xfa", b"\\\xfa")
168 for i in range(0o400, 0o1000):
169- with self.assertWarns(DeprecationWarning):
170+ with self.assertWarnsRegex(DeprecationWarning,
171+ r'"\\%o" is an invalid octal escape sequence' % i):
172 check(rb'\%o' % i, bytes([i & 0o377]))
173
174+ with self.assertWarnsRegex(DeprecationWarning,
175+ r'"\\z" is an invalid escape sequence'):
176+ self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
177+ with self.assertWarnsRegex(DeprecationWarning,
178+ r'"\\501" is an invalid octal escape sequence'):
179+ self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
180+
181 def test_errors(self):
182 decode = codecs.escape_decode
183 self.assertRaises(ValueError, decode, br"\x")
184@@ -2661,24 +2677,40 @@ def test_escape_decode(self):
185 check(br"[\x410]", "[A0]")
186 check(br"\u20ac", "\u20ac")
187 check(br"\U0001d120", "\U0001d120")
188+
189+ def test_decode_warnings(self):
190+ decode = codecs.unicode_escape_decode
191+ check = coding_checker(self, decode)
192 for i in range(97, 123):
193 b = bytes([i])
194 if b not in b'abfnrtuvx':
195- with self.assertWarns(DeprecationWarning):
196+ with self.assertWarnsRegex(DeprecationWarning,
197+ r'"\\%c" is an invalid escape sequence' % i):
198 check(b"\\" + b, "\\" + chr(i))
199 if b.upper() not in b'UN':
200- with self.assertWarns(DeprecationWarning):
201+ with self.assertWarnsRegex(DeprecationWarning,
202+ r'"\\%c" is an invalid escape sequence' % (i-32)):
203 check(b"\\" + b.upper(), "\\" + chr(i-32))
204- with self.assertWarns(DeprecationWarning):
205+ with self.assertWarnsRegex(DeprecationWarning,
206+ r'"\\8" is an invalid escape sequence'):
207 check(br"\8", "\\8")
208 with self.assertWarns(DeprecationWarning):
209 check(br"\9", "\\9")
210- with self.assertWarns(DeprecationWarning):
211+ with self.assertWarnsRegex(DeprecationWarning,
212+ r'"\\\xfa" is an invalid escape sequence') as cm:
213 check(b"\\\xfa", "\\\xfa")
214 for i in range(0o400, 0o1000):
215- with self.assertWarns(DeprecationWarning):
216+ with self.assertWarnsRegex(DeprecationWarning,
217+ r'"\\%o" is an invalid octal escape sequence' % i):
218 check(rb'\%o' % i, chr(i))
219
220+ with self.assertWarnsRegex(DeprecationWarning,
221+ r'"\\z" is an invalid escape sequence'):
222+ self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
223+ with self.assertWarnsRegex(DeprecationWarning,
224+ r'"\\501" is an invalid octal escape sequence'):
225+ self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
226+
227 def test_decode_errors(self):
228 decode = codecs.unicode_escape_decode
229 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
230diff --git a/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
231new file mode 100644
232index 00000000000000..39d2f1e1a892cf
233--- /dev/null
234+++ b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
235@@ -0,0 +1,2 @@
236+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
237+handler.
238diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
239index fc407ec6bf99d6..87ea1162e03513 100644
240--- a/Objects/bytesobject.c
241+++ b/Objects/bytesobject.c
242@@ -1075,10 +1075,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
243 }
244
245 /* Unescape a backslash-escaped string. */
246-PyObject *_PyBytes_DecodeEscape(const char *s,
247+PyObject *_PyBytes_DecodeEscape2(const char *s,
248 Py_ssize_t len,
249 const char *errors,
250- const char **first_invalid_escape)
251+ int *first_invalid_escape_char,
252+ const char **first_invalid_escape_ptr)
253 {
254 int c;
255 char *p;
256@@ -1092,7 +1093,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
257 return NULL;
258 writer.overallocate = 1;
259
260- *first_invalid_escape = NULL;
261+ *first_invalid_escape_char = -1;
262+ *first_invalid_escape_ptr = NULL;
263
264 end = s + len;
265 while (s < end) {
266@@ -1130,9 +1132,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
267 c = (c<<3) + *s++ - '0';
268 }
269 if (c > 0377) {
270- if (*first_invalid_escape == NULL) {
271- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
272- already incremented s. */
273+ if (*first_invalid_escape_char == -1) {
274+ *first_invalid_escape_char = c;
275+ /* Back up 3 chars, since we've already incremented s. */
276+ *first_invalid_escape_ptr = s - 3;
277 }
278 }
279 *p++ = c;
280@@ -1173,9 +1176,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
281 break;
282
283 default:
284- if (*first_invalid_escape == NULL) {
285- *first_invalid_escape = s-1; /* Back up one char, since we've
286- already incremented s. */
287+ if (*first_invalid_escape_char == -1) {
288+ *first_invalid_escape_char = (unsigned char)s[-1];
289+ /* Back up one char, since we've already incremented s. */
290+ *first_invalid_escape_ptr = s - 1;
291 }
292 *p++ = '\\';
293 s--;
294@@ -1195,18 +1199,19 @@ PyObject *PyBytes_DecodeEscape(const char *s,
295 Py_ssize_t Py_UNUSED(unicode),
296 const char *Py_UNUSED(recode_encoding))
297 {
298- const char* first_invalid_escape;
299- PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
300- &first_invalid_escape);
301+ int first_invalid_escape_char;
302+ const char *first_invalid_escape_ptr;
303+ PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
304+ &first_invalid_escape_char,
305+ &first_invalid_escape_ptr);
306 if (result == NULL)
307 return NULL;
308- if (first_invalid_escape != NULL) {
309- unsigned char c = *first_invalid_escape;
310- if ('4' <= c && c <= '7') {
311+ if (first_invalid_escape_char != -1) {
312+ if (first_invalid_escape_char > 0xff) {
313 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
314- "b\"\\%.3s\" is an invalid octal escape sequence. "
315+ "b\"\\%o\" is an invalid octal escape sequence. "
316 "Such sequences will not work in the future. ",
317- first_invalid_escape) < 0)
318+ first_invalid_escape_char) < 0)
319 {
320 Py_DECREF(result);
321 return NULL;
322@@ -1216,7 +1221,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
323 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
324 "b\"\\%c\" is an invalid escape sequence. "
325 "Such sequences will not work in the future. ",
326- c) < 0)
327+ first_invalid_escape_char) < 0)
328 {
329 Py_DECREF(result);
330 return NULL;
331diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
332index f3f0c9646a652e..cd26494ad8f1d6 100644
333--- a/Objects/unicodeobject.c
334+++ b/Objects/unicodeobject.c
335@@ -6596,13 +6596,15 @@ _PyUnicode_GetNameCAPI(void)
336 /* --- Unicode Escape Codec ----------------------------------------------- */
337
338 PyObject *
339-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
340+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
341 Py_ssize_t size,
342 const char *errors,
343 Py_ssize_t *consumed,
344- const char **first_invalid_escape)
345+ int *first_invalid_escape_char,
346+ const char **first_invalid_escape_ptr)
347 {
348 const char *starts = s;
349+ const char *initial_starts = starts;
350 _PyUnicodeWriter writer;
351 const char *end;
352 PyObject *errorHandler = NULL;
353@@ -6610,7 +6612,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
354 _PyUnicode_Name_CAPI *ucnhash_capi;
355
356 // so we can remember if we've seen an invalid escape char or not
357- *first_invalid_escape = NULL;
358+ *first_invalid_escape_char = -1;
359+ *first_invalid_escape_ptr = NULL;
360
361 if (size == 0) {
362 if (consumed) {
363@@ -6698,9 +6701,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
364 }
365 }
366 if (ch > 0377) {
367- if (*first_invalid_escape == NULL) {
368- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
369- already incremented s. */
370+ if (*first_invalid_escape_char == -1) {
371+ *first_invalid_escape_char = ch;
372+ if (starts == initial_starts) {
373+ /* Back up 3 chars, since we've already incremented s. */
374+ *first_invalid_escape_ptr = s - 3;
375+ }
376 }
377 }
378 WRITE_CHAR(ch);
379@@ -6795,9 +6801,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
380 goto error;
381
382 default:
383- if (*first_invalid_escape == NULL) {
384- *first_invalid_escape = s-1; /* Back up one char, since we've
385- already incremented s. */
386+ if (*first_invalid_escape_char == -1) {
387+ *first_invalid_escape_char = c;
388+ if (starts == initial_starts) {
389+ /* Back up one char, since we've already incremented s. */
390+ *first_invalid_escape_ptr = s - 1;
391+ }
392 }
393 WRITE_ASCII_CHAR('\\');
394 WRITE_CHAR(c);
395@@ -6842,19 +6851,20 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
396 const char *errors,
397 Py_ssize_t *consumed)
398 {
399- const char *first_invalid_escape;
400- PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
401+ int first_invalid_escape_char;
402+ const char *first_invalid_escape_ptr;
403+ PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
404 consumed,
405- &first_invalid_escape);
406+ &first_invalid_escape_char,
407+ &first_invalid_escape_ptr);
408 if (result == NULL)
409 return NULL;
410- if (first_invalid_escape != NULL) {
411- unsigned char c = *first_invalid_escape;
412- if ('4' <= c && c <= '7') {
413+ if (first_invalid_escape_char != -1) {
414+ if (first_invalid_escape_char > 0xff) {
415 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
416- "\"\\%.3s\" is an invalid octal escape sequence. "
417+ "\"\\%o\" is an invalid octal escape sequence. "
418 "Such sequences will not work in the future. ",
419- first_invalid_escape) < 0)
420+ first_invalid_escape_char) < 0)
421 {
422 Py_DECREF(result);
423 return NULL;
424@@ -6864,7 +6874,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
425 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
426 "\"\\%c\" is an invalid escape sequence. "
427 "Such sequences will not work in the future. ",
428- c) < 0)
429+ first_invalid_escape_char) < 0)
430 {
431 Py_DECREF(result);
432 return NULL;
433diff --git a/Parser/string_parser.c b/Parser/string_parser.c
434index d3631b114c5a3c..ebe68989d1af58 100644
435--- a/Parser/string_parser.c
436+++ b/Parser/string_parser.c
437@@ -196,15 +196,18 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
438 len = (size_t)(p - buf);
439 s = buf;
440
441- const char *first_invalid_escape;
442- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
443+ int first_invalid_escape_char;
444+ const char *first_invalid_escape_ptr;
445+ v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
446+ &first_invalid_escape_char,
447+ &first_invalid_escape_ptr);
448
449 // HACK: later we can simply pass the line no, since we don't preserve the tokens
450 // when we are decoding the string but we preserve the line numbers.
451- if (v != NULL && first_invalid_escape != NULL && t != NULL) {
452- if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
453- /* We have not decref u before because first_invalid_escape points
454- inside u. */
455+ if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
456+ if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
457+ /* We have not decref u before because first_invalid_escape_ptr
458+ points inside u. */
459 Py_XDECREF(u);
460 Py_DECREF(v);
461 return NULL;
462@@ -217,14 +220,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
463 static PyObject *
464 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
465 {
466- const char *first_invalid_escape;
467- PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
468+ int first_invalid_escape_char;
469+ const char *first_invalid_escape_ptr;
470+ PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
471+ &first_invalid_escape_char,
472+ &first_invalid_escape_ptr);
473 if (result == NULL) {
474 return NULL;
475 }
476
477- if (first_invalid_escape != NULL) {
478- if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
479+ if (first_invalid_escape_ptr != NULL) {
480+ if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
481 Py_DECREF(result);
482 return NULL;
483 }