at master 23 kB view raw
1From 9f69a58623bd01349a18ba0c7a9cb1dad6a51e8e Mon Sep 17 00:00:00 2001 2From: Serhiy Storchaka <storchaka@gmail.com> 3Date: Mon, 12 May 2025 20:42:23 +0300 4Subject: [PATCH] gh-133767: Fix use-after-free in the unicode-escape decoder 5 with an error handler (GH-129648) 6 7If the error handler is used, a new bytes object is created to set as 8the object attribute of UnicodeDecodeError, and that bytes object then 9replaces the original data. A pointer to the decoded data will become invalid 10after destroying that temporary bytes object. So we need another way to return 11the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). 12 13_PyBytes_DecodeEscape() does not have such an issue, because it does not 14use the error handlers registry, but it should be changed for compatibility 15with _PyUnicode_DecodeUnicodeEscapeInternal(). 16--- 17 Include/internal/pycore_bytesobject.h | 5 +- 18 Include/internal/pycore_unicodeobject.h | 12 +++-- 19 Lib/test/test_codeccallbacks.py | 39 +++++++++++++- 20 Lib/test/test_codecs.py | 52 +++++++++++++++---- 21 ...-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst | 2 + 22 Objects/bytesobject.c | 41 ++++++++------- 23 Objects/unicodeobject.c | 46 +++++++++------- 24 Parser/string_parser.c | 26 ++++++---- 25 8 files changed, 160 insertions(+), 63 deletions(-) 26 create mode 100644 Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst 27 28diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h 29index 300e7f4896a39e..8ea9b3ebb88454 100644 30--- a/Include/internal/pycore_bytesobject.h 31+++ b/Include/internal/pycore_bytesobject.h 32@@ -20,8 +20,9 @@ extern PyObject* _PyBytes_FromHex( 33 34 // Helper for PyBytes_DecodeEscape that detects invalid escape chars. 35 // Export for test_peg_generator. 
36-PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t, 37- const char *, const char **); 38+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t, 39+ const char *, 40+ int *, const char **); 41 42 43 // Substring Search. 44diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h 45index c85d53b89accdb..3791b913c17546 100644 46--- a/Include/internal/pycore_unicodeobject.h 47+++ b/Include/internal/pycore_unicodeobject.h 48@@ -139,14 +139,18 @@ extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful( 49 // Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape 50 // chars. 51 // Export for test_peg_generator. 52-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal( 53+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2( 54 const char *string, /* Unicode-Escape encoded string */ 55 Py_ssize_t length, /* size of string */ 56 const char *errors, /* error handling */ 57 Py_ssize_t *consumed, /* bytes consumed */ 58- const char **first_invalid_escape); /* on return, points to first 59- invalid escaped char in 60- string. */ 61+ int *first_invalid_escape_char, /* on return, if not -1, contain the first 62+ invalid escaped char (<= 0xff) or invalid 63+ octal escape (> 0xff) in string. */ 64+ const char **first_invalid_escape_ptr); /* on return, if not NULL, may 65+ point to the first invalid escaped 66+ char in string. 67+ May be NULL if errors is not NULL. 
*/ 68 69 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */ 70 71diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py 72index 86e5e5c1474674..a767f67a02cf56 100644 73--- a/Lib/test/test_codeccallbacks.py 74+++ b/Lib/test/test_codeccallbacks.py 75@@ -2,6 +2,7 @@ 76 import codecs 77 import html.entities 78 import itertools 79+import re 80 import sys 81 import unicodedata 82 import unittest 83@@ -1125,7 +1126,7 @@ def test_bug828737(self): 84 text = 'abc<def>ghi'*n 85 text.translate(charmap) 86 87- def test_mutatingdecodehandler(self): 88+ def test_mutating_decode_handler(self): 89 baddata = [ 90 ("ascii", b"\xff"), 91 ("utf-7", b"++"), 92@@ -1160,6 +1161,42 @@ def mutating(exc): 93 for (encoding, data) in baddata: 94 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") 95 96+ def test_mutating_decode_handler_unicode_escape(self): 97+ decode = codecs.unicode_escape_decode 98+ def mutating(exc): 99+ if isinstance(exc, UnicodeDecodeError): 100+ r = data.get(exc.object[:exc.end]) 101+ if r is not None: 102+ exc.object = r[0] + exc.object[exc.end:] 103+ return ('\u0404', r[1]) 104+ raise AssertionError("don't know how to handle %r" % exc) 105+ 106+ codecs.register_error('test.mutating2', mutating) 107+ data = { 108+ br'\x0': (b'\\', 0), 109+ br'\x3': (b'xxx\\', 3), 110+ br'\x5': (b'x\\', 1), 111+ } 112+ def check(input, expected, msg): 113+ with self.assertWarns(DeprecationWarning) as cm: 114+ self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input))) 115+ self.assertIn(msg, str(cm.warning)) 116+ 117+ check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence') 118+ check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence') 119+ check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence') 120+ 121+ check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence') 122+ check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape 
sequence') 123+ check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence') 124+ check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence') 125+ check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence') 126+ 127+ check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence') 128+ check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence') 129+ check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence') 130+ check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence') 131+ 132 # issue32583 133 def test_crashing_decode_handler(self): 134 # better generating one more character to fill the extra space slot 135diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py 136index 94fcf98e75721f..d42270da15ee32 100644 137--- a/Lib/test/test_codecs.py 138+++ b/Lib/test/test_codecs.py 139@@ -1196,23 +1196,39 @@ def test_escape(self): 140 check(br"[\1010]", b"[A0]") 141 check(br"[\x41]", b"[A]") 142 check(br"[\x410]", b"[A0]") 143+ 144+ def test_warnings(self): 145+ decode = codecs.escape_decode 146+ check = coding_checker(self, decode) 147 for i in range(97, 123): 148 b = bytes([i]) 149 if b not in b'abfnrtvx': 150- with self.assertWarns(DeprecationWarning): 151+ with self.assertWarnsRegex(DeprecationWarning, 152+ r'"\\%c" is an invalid escape sequence' % i): 153 check(b"\\" + b, b"\\" + b) 154- with self.assertWarns(DeprecationWarning): 155+ with self.assertWarnsRegex(DeprecationWarning, 156+ r'"\\%c" is an invalid escape sequence' % (i-32)): 157 check(b"\\" + b.upper(), b"\\" + b.upper()) 158- with self.assertWarns(DeprecationWarning): 159+ with self.assertWarnsRegex(DeprecationWarning, 160+ r'"\\8" is an invalid escape sequence'): 161 check(br"\8", b"\\8") 162 with self.assertWarns(DeprecationWarning): 163 check(br"\9", b"\\9") 164- with self.assertWarns(DeprecationWarning): 165+ with self.assertWarnsRegex(DeprecationWarning, 166+ 
r'"\\\xfa" is an invalid escape sequence') as cm: 167 check(b"\\\xfa", b"\\\xfa") 168 for i in range(0o400, 0o1000): 169- with self.assertWarns(DeprecationWarning): 170+ with self.assertWarnsRegex(DeprecationWarning, 171+ r'"\\%o" is an invalid octal escape sequence' % i): 172 check(rb'\%o' % i, bytes([i & 0o377])) 173 174+ with self.assertWarnsRegex(DeprecationWarning, 175+ r'"\\z" is an invalid escape sequence'): 176+ self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4)) 177+ with self.assertWarnsRegex(DeprecationWarning, 178+ r'"\\501" is an invalid octal escape sequence'): 179+ self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6)) 180+ 181 def test_errors(self): 182 decode = codecs.escape_decode 183 self.assertRaises(ValueError, decode, br"\x") 184@@ -2661,24 +2677,40 @@ def test_escape_decode(self): 185 check(br"[\x410]", "[A0]") 186 check(br"\u20ac", "\u20ac") 187 check(br"\U0001d120", "\U0001d120") 188+ 189+ def test_decode_warnings(self): 190+ decode = codecs.unicode_escape_decode 191+ check = coding_checker(self, decode) 192 for i in range(97, 123): 193 b = bytes([i]) 194 if b not in b'abfnrtuvx': 195- with self.assertWarns(DeprecationWarning): 196+ with self.assertWarnsRegex(DeprecationWarning, 197+ r'"\\%c" is an invalid escape sequence' % i): 198 check(b"\\" + b, "\\" + chr(i)) 199 if b.upper() not in b'UN': 200- with self.assertWarns(DeprecationWarning): 201+ with self.assertWarnsRegex(DeprecationWarning, 202+ r'"\\%c" is an invalid escape sequence' % (i-32)): 203 check(b"\\" + b.upper(), "\\" + chr(i-32)) 204- with self.assertWarns(DeprecationWarning): 205+ with self.assertWarnsRegex(DeprecationWarning, 206+ r'"\\8" is an invalid escape sequence'): 207 check(br"\8", "\\8") 208 with self.assertWarns(DeprecationWarning): 209 check(br"\9", "\\9") 210- with self.assertWarns(DeprecationWarning): 211+ with self.assertWarnsRegex(DeprecationWarning, 212+ r'"\\\xfa" is an invalid escape sequence') as cm: 213 check(b"\\\xfa", "\\\xfa") 214 for i in 
range(0o400, 0o1000): 215- with self.assertWarns(DeprecationWarning): 216+ with self.assertWarnsRegex(DeprecationWarning, 217+ r'"\\%o" is an invalid octal escape sequence' % i): 218 check(rb'\%o' % i, chr(i)) 219 220+ with self.assertWarnsRegex(DeprecationWarning, 221+ r'"\\z" is an invalid escape sequence'): 222+ self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4)) 223+ with self.assertWarnsRegex(DeprecationWarning, 224+ r'"\\501" is an invalid octal escape sequence'): 225+ self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6)) 226+ 227 def test_decode_errors(self): 228 decode = codecs.unicode_escape_decode 229 for c, d in (b'x', 2), (b'u', 4), (b'U', 4): 230diff --git a/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst 231new file mode 100644 232index 00000000000000..39d2f1e1a892cf 233--- /dev/null 234+++ b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst 235@@ -0,0 +1,2 @@ 236+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error 237+handler. 238diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c 239index fc407ec6bf99d6..87ea1162e03513 100644 240--- a/Objects/bytesobject.c 241+++ b/Objects/bytesobject.c 242@@ -1075,10 +1075,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len, 243 } 244 245 /* Unescape a backslash-escaped string. 
*/ 246-PyObject *_PyBytes_DecodeEscape(const char *s, 247+PyObject *_PyBytes_DecodeEscape2(const char *s, 248 Py_ssize_t len, 249 const char *errors, 250- const char **first_invalid_escape) 251+ int *first_invalid_escape_char, 252+ const char **first_invalid_escape_ptr) 253 { 254 int c; 255 char *p; 256@@ -1092,7 +1093,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s, 257 return NULL; 258 writer.overallocate = 1; 259 260- *first_invalid_escape = NULL; 261+ *first_invalid_escape_char = -1; 262+ *first_invalid_escape_ptr = NULL; 263 264 end = s + len; 265 while (s < end) { 266@@ -1130,9 +1132,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, 267 c = (c<<3) + *s++ - '0'; 268 } 269 if (c > 0377) { 270- if (*first_invalid_escape == NULL) { 271- *first_invalid_escape = s-3; /* Back up 3 chars, since we've 272- already incremented s. */ 273+ if (*first_invalid_escape_char == -1) { 274+ *first_invalid_escape_char = c; 275+ /* Back up 3 chars, since we've already incremented s. */ 276+ *first_invalid_escape_ptr = s - 3; 277 } 278 } 279 *p++ = c; 280@@ -1173,9 +1176,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, 281 break; 282 283 default: 284- if (*first_invalid_escape == NULL) { 285- *first_invalid_escape = s-1; /* Back up one char, since we've 286- already incremented s. */ 287+ if (*first_invalid_escape_char == -1) { 288+ *first_invalid_escape_char = (unsigned char)s[-1]; 289+ /* Back up one char, since we've already incremented s. 
*/ 290+ *first_invalid_escape_ptr = s - 1; 291 } 292 *p++ = '\\'; 293 s--; 294@@ -1195,18 +1199,19 @@ PyObject *PyBytes_DecodeEscape(const char *s, 295 Py_ssize_t Py_UNUSED(unicode), 296 const char *Py_UNUSED(recode_encoding)) 297 { 298- const char* first_invalid_escape; 299- PyObject *result = _PyBytes_DecodeEscape(s, len, errors, 300- &first_invalid_escape); 301+ int first_invalid_escape_char; 302+ const char *first_invalid_escape_ptr; 303+ PyObject *result = _PyBytes_DecodeEscape2(s, len, errors, 304+ &first_invalid_escape_char, 305+ &first_invalid_escape_ptr); 306 if (result == NULL) 307 return NULL; 308- if (first_invalid_escape != NULL) { 309- unsigned char c = *first_invalid_escape; 310- if ('4' <= c && c <= '7') { 311+ if (first_invalid_escape_char != -1) { 312+ if (first_invalid_escape_char > 0xff) { 313 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 314- "b\"\\%.3s\" is an invalid octal escape sequence. " 315+ "b\"\\%o\" is an invalid octal escape sequence. " 316 "Such sequences will not work in the future. ", 317- first_invalid_escape) < 0) 318+ first_invalid_escape_char) < 0) 319 { 320 Py_DECREF(result); 321 return NULL; 322@@ -1216,7 +1221,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, 323 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 324 "b\"\\%c\" is an invalid escape sequence. " 325 "Such sequences will not work in the future. 
", 326- c) < 0) 327+ first_invalid_escape_char) < 0) 328 { 329 Py_DECREF(result); 330 return NULL; 331diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c 332index f3f0c9646a652e..cd26494ad8f1d6 100644 333--- a/Objects/unicodeobject.c 334+++ b/Objects/unicodeobject.c 335@@ -6596,13 +6596,15 @@ _PyUnicode_GetNameCAPI(void) 336 /* --- Unicode Escape Codec ----------------------------------------------- */ 337 338 PyObject * 339-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, 340+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s, 341 Py_ssize_t size, 342 const char *errors, 343 Py_ssize_t *consumed, 344- const char **first_invalid_escape) 345+ int *first_invalid_escape_char, 346+ const char **first_invalid_escape_ptr) 347 { 348 const char *starts = s; 349+ const char *initial_starts = starts; 350 _PyUnicodeWriter writer; 351 const char *end; 352 PyObject *errorHandler = NULL; 353@@ -6610,7 +6612,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, 354 _PyUnicode_Name_CAPI *ucnhash_capi; 355 356 // so we can remember if we've seen an invalid escape char or not 357- *first_invalid_escape = NULL; 358+ *first_invalid_escape_char = -1; 359+ *first_invalid_escape_ptr = NULL; 360 361 if (size == 0) { 362 if (consumed) { 363@@ -6698,9 +6701,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, 364 } 365 } 366 if (ch > 0377) { 367- if (*first_invalid_escape == NULL) { 368- *first_invalid_escape = s-3; /* Back up 3 chars, since we've 369- already incremented s. */ 370+ if (*first_invalid_escape_char == -1) { 371+ *first_invalid_escape_char = ch; 372+ if (starts == initial_starts) { 373+ /* Back up 3 chars, since we've already incremented s. 
*/ 374+ *first_invalid_escape_ptr = s - 3; 375+ } 376 } 377 } 378 WRITE_CHAR(ch); 379@@ -6795,9 +6801,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, 380 goto error; 381 382 default: 383- if (*first_invalid_escape == NULL) { 384- *first_invalid_escape = s-1; /* Back up one char, since we've 385- already incremented s. */ 386+ if (*first_invalid_escape_char == -1) { 387+ *first_invalid_escape_char = c; 388+ if (starts == initial_starts) { 389+ /* Back up one char, since we've already incremented s. */ 390+ *first_invalid_escape_ptr = s - 1; 391+ } 392 } 393 WRITE_ASCII_CHAR('\\'); 394 WRITE_CHAR(c); 395@@ -6842,19 +6851,20 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, 396 const char *errors, 397 Py_ssize_t *consumed) 398 { 399- const char *first_invalid_escape; 400- PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, 401+ int first_invalid_escape_char; 402+ const char *first_invalid_escape_ptr; 403+ PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors, 404 consumed, 405- &first_invalid_escape); 406+ &first_invalid_escape_char, 407+ &first_invalid_escape_ptr); 408 if (result == NULL) 409 return NULL; 410- if (first_invalid_escape != NULL) { 411- unsigned char c = *first_invalid_escape; 412- if ('4' <= c && c <= '7') { 413+ if (first_invalid_escape_char != -1) { 414+ if (first_invalid_escape_char > 0xff) { 415 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 416- "\"\\%.3s\" is an invalid octal escape sequence. " 417+ "\"\\%o\" is an invalid octal escape sequence. " 418 "Such sequences will not work in the future. ", 419- first_invalid_escape) < 0) 420+ first_invalid_escape_char) < 0) 421 { 422 Py_DECREF(result); 423 return NULL; 424@@ -6864,7 +6874,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, 425 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 426 "\"\\%c\" is an invalid escape sequence. " 427 "Such sequences will not work in the future. 
", 428- c) < 0) 429+ first_invalid_escape_char) < 0) 430 { 431 Py_DECREF(result); 432 return NULL; 433diff --git a/Parser/string_parser.c b/Parser/string_parser.c 434index d3631b114c5a3c..ebe68989d1af58 100644 435--- a/Parser/string_parser.c 436+++ b/Parser/string_parser.c 437@@ -196,15 +196,18 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) 438 len = (size_t)(p - buf); 439 s = buf; 440 441- const char *first_invalid_escape; 442- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape); 443+ int first_invalid_escape_char; 444+ const char *first_invalid_escape_ptr; 445+ v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL, 446+ &first_invalid_escape_char, 447+ &first_invalid_escape_ptr); 448 449 // HACK: later we can simply pass the line no, since we don't preserve the tokens 450 // when we are decoding the string but we preserve the line numbers. 451- if (v != NULL && first_invalid_escape != NULL && t != NULL) { 452- if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) { 453- /* We have not decref u before because first_invalid_escape points 454- inside u. */ 455+ if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) { 456+ if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) { 457+ /* We have not decref u before because first_invalid_escape_ptr 458+ points inside u. 
*/ 459 Py_XDECREF(u); 460 Py_DECREF(v); 461 return NULL; 462@@ -217,14 +220,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) 463 static PyObject * 464 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) 465 { 466- const char *first_invalid_escape; 467- PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); 468+ int first_invalid_escape_char; 469+ const char *first_invalid_escape_ptr; 470+ PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL, 471+ &first_invalid_escape_char, 472+ &first_invalid_escape_ptr); 473 if (result == NULL) { 474 return NULL; 475 } 476 477- if (first_invalid_escape != NULL) { 478- if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) { 479+ if (first_invalid_escape_ptr != NULL) { 480+ if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) { 481 Py_DECREF(result); 482 return NULL; 483 }