Serenity Operating System
at master 1130 lines 48 kB view raw
1/* 2 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <LibTest/TestCase.h> // import first, to prevent warning of VERIFY* redefinition 8 9#include <AK/Debug.h> 10#include <AK/StringBuilder.h> 11#include <AK/Tuple.h> 12#include <LibRegex/Regex.h> 13#include <LibRegex/RegexDebug.h> 14#include <LibRegex/RegexMatcher.h> 15#include <stdio.h> 16 17static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options) 18{ 19 return options; 20} 21 22static PosixOptions match_test_api_options(const PosixOptions options) 23{ 24 return options; 25} 26 27template<typename... Flags> 28static constexpr ECMAScriptFlags combine_flags(Flags&&... flags) 29requires((IsSame<Flags, ECMAScriptFlags> && ...)) 30{ 31 return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...)); 32} 33 34TEST_CASE(regex_options_ecmascript) 35{ 36 ECMAScriptOptions eo; 37 eo |= ECMAScriptFlags::Global; 38 39 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global)); 40 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive)); 41 42 eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky); 43 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global)); 44 EXPECT(eo.has_flag_set(ECMAScriptFlags::Insensitive)); 45 EXPECT(eo.has_flag_set(ECMAScriptFlags::Sticky)); 46 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Unicode)); 47 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline)); 48 EXPECT(!eo.has_flag_set(ECMAScriptFlags::SingleLine)); 49 50 eo &= ECMAScriptFlags::Insensitive; 51 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Global)); 52 EXPECT(eo.has_flag_set(ECMAScriptFlags::Insensitive)); 53 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline)); 54 55 eo &= ECMAScriptFlags::Sticky; 56 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Global)); 57 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive)); 58 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline)); 59 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Sticky)); 60 61 eo = ~ECMAScriptFlags::Insensitive; 62 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global)); 63 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive)); 64 EXPECT(eo.has_flag_set(ECMAScriptFlags::Multiline)); 65 EXPECT(eo.has_flag_set(ECMAScriptFlags::Sticky)); 66} 67 68TEST_CASE(regex_options_posix) 69{ 70 PosixOptions eo; 71 eo |= PosixFlags::Global; 72 73 EXPECT(eo.has_flag_set(PosixFlags::Global)); 74 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive)); 75 76 eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine); 77 EXPECT(eo.has_flag_set(PosixFlags::Global)); 78 EXPECT(eo.has_flag_set(PosixFlags::Insensitive)); 79 EXPECT(eo.has_flag_set(PosixFlags::MatchNotBeginOfLine)); 80 EXPECT(!eo.has_flag_set(PosixFlags::Unicode)); 81 EXPECT(!eo.has_flag_set(PosixFlags::Multiline)); 82 83 eo &= PosixFlags::Insensitive; 84 EXPECT(!eo.has_flag_set(PosixFlags::Global)); 85 EXPECT(eo.has_flag_set(PosixFlags::Insensitive)); 86 EXPECT(!eo.has_flag_set(PosixFlags::Multiline)); 87 88 eo &= PosixFlags::MatchNotBeginOfLine; 89 EXPECT(!eo.has_flag_set(PosixFlags::Global)); 90 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive)); 91 EXPECT(!eo.has_flag_set(PosixFlags::Multiline)); 92 93 eo = ~PosixFlags::Insensitive; 94 EXPECT(eo.has_flag_set(PosixFlags::Global)); 95 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive)); 96 EXPECT(eo.has_flag_set(PosixFlags::Multiline)); 97} 98 99TEST_CASE(regex_lexer) 100{ 101 Lexer l("/[.*+?^${}()|[\\]\\\\]/g"sv); 102 EXPECT(l.next().type() == regex::TokenType::Slash); 103 EXPECT(l.next().type() == regex::TokenType::LeftBracket); 104 EXPECT(l.next().type() == regex::TokenType::Period); 105 EXPECT(l.next().type() == regex::TokenType::Asterisk); 106 EXPECT(l.next().type() == regex::TokenType::Plus); 107 EXPECT(l.next().type() == regex::TokenType::Questionmark); 108 EXPECT(l.next().type() == regex::TokenType::Circumflex); 109 EXPECT(l.next().type() == regex::TokenType::Dollar); 110 EXPECT(l.next().type() == regex::TokenType::LeftCurly); 111 EXPECT(l.next().type() == regex::TokenType::RightCurly); 112 EXPECT(l.next().type() == regex::TokenType::LeftParen); 113 EXPECT(l.next().type() == regex::TokenType::RightParen); 114 EXPECT(l.next().type() == regex::TokenType::Pipe); 115 EXPECT(l.next().type() == regex::TokenType::LeftBracket); 116 EXPECT(l.next().type() == regex::TokenType::EscapeSequence); 117 EXPECT(l.next().type() == regex::TokenType::EscapeSequence); 118 EXPECT(l.next().type() == regex::TokenType::RightBracket); 119 EXPECT(l.next().type() == regex::TokenType::Slash); 120 EXPECT(l.next().type() == regex::TokenType::Char); 121} 122 123TEST_CASE(parser_error_parens) 124{ 125 DeprecatedString pattern = "test()test"; 126 Lexer l(pattern); 127 PosixExtendedParser p(l); 128 p.parse(); 129 EXPECT(p.has_error()); 130 EXPECT(p.error() == regex::Error::EmptySubExpression); 131} 132 133TEST_CASE(parser_error_special_characters_used_at_wrong_place) 134{ 135 DeprecatedString pattern; 136 Vector<char, 5> chars = { '*', '+', '?', '{' }; 137 StringBuilder b; 138 139 Lexer l; 140 PosixExtended p(l); 141 142 for (auto& ch : chars) { 143 // First in ere 144 b.clear(); 145 b.append(ch); 146 pattern = b.to_deprecated_string(); 147 l.set_source(pattern); 148 p.parse(); 149 EXPECT(p.has_error()); 150 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker); 151 152 // After vertical line 153 b.clear(); 154 b.append("a|"sv); 155 b.append(ch); 156 pattern = b.to_deprecated_string(); 157 l.set_source(pattern); 158 p.parse(); 159 EXPECT(p.has_error()); 160 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker); 161 162 // After circumflex 163 b.clear(); 164 b.append('^'); 165 b.append(ch); 166 pattern = b.to_deprecated_string(); 167 l.set_source(pattern); 168 p.parse(); 169 EXPECT(p.has_error()); 170 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker); 171 172 // After dollar 173 b.clear(); 174 b.append('$'); 175 b.append(ch); 176 pattern = b.to_deprecated_string(); 177 l.set_source(pattern); 178 p.parse(); 179 EXPECT(p.has_error()); 180 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker); 181 182 // After left parens 183 b.clear(); 184 b.append('('); 185 b.append(ch); 186 b.append(')'); 187 pattern = b.to_deprecated_string(); 188 l.set_source(pattern); 189 p.parse(); 190 EXPECT(p.has_error()); 191 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker); 192 } 193} 194 195TEST_CASE(parser_error_vertical_line_used_at_wrong_place) 196{ 197 Lexer l; 198 PosixExtended p(l); 199 200 // First in ere 201 l.set_source("|asdf"sv); 202 p.parse(); 203 EXPECT(p.has_error()); 204 EXPECT(p.error() == regex::Error::EmptySubExpression); 205 206 // Last in ere 207 l.set_source("asdf|"sv); 208 p.parse(); 209 EXPECT(p.has_error()); 210 EXPECT(p.error() == regex::Error::EmptySubExpression); 211 212 // After left parens 213 l.set_source("(|asdf)"sv); 214 p.parse(); 215 EXPECT(p.has_error()); 216 EXPECT(p.error() == regex::Error::EmptySubExpression); 217 218 // Proceed right parens 219 l.set_source("(asdf)|"sv); 220 p.parse(); 221 EXPECT(p.has_error()); 222 EXPECT(p.error() == regex::Error::EmptySubExpression); 223} 224 225TEST_CASE(catch_all_first) 226{ 227 Regex<PosixExtended> re("^.*$"); 228 RegexResult m; 229 re.match("Hello World"sv, m); 230 EXPECT(m.count == 1); 231 EXPECT(re.match("Hello World"sv, m)); 232} 233 234TEST_CASE(catch_all) 235{ 236 Regex<PosixExtended> re("^.*$", PosixFlags::Global); 237 238 EXPECT(re.has_match("Hello World"sv)); 239 EXPECT(re.match("Hello World"sv).success); 240 EXPECT(re.match("Hello World"sv).count == 1); 241 242 EXPECT(has_match("Hello World"sv, re)); 243 auto res = match("Hello World"sv, re); 244 EXPECT(res.success); 245 EXPECT(res.count == 1); 246 EXPECT(res.matches.size() == 1); 247 EXPECT(res.matches.first().view == "Hello World"); 248} 249 250TEST_CASE(catch_all_again) 251{ 252 Regex<PosixExtended> re("^.*$", PosixFlags::Extra); 253 EXPECT_EQ(has_match("Hello World"sv, re), true); 254} 255 256TEST_CASE(char_utf8) 257{ 258 Regex<PosixExtended> re("😀"); 259 RegexResult result; 260 261 EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界"sv }, re, PosixFlags::Global)).success, true); 262 EXPECT_EQ(result.count, 2u); 263} 264 265TEST_CASE(catch_all_newline) 266{ 267 Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches); 268 RegexResult result; 269 auto lambda = [&result, &re]() { 270 DeprecatedString aaa = "Hello World\nTest\n1234\n"; 271 result = match(aaa, re); 272 EXPECT_EQ(result.success, true); 273 }; 274 lambda(); 275 EXPECT_EQ(result.count, 3u); 276 EXPECT_EQ(result.matches.at(0).view, "Hello World"); 277 EXPECT_EQ(result.matches.at(1).view, "Test"); 278 EXPECT_EQ(result.matches.at(2).view, "1234"); 279} 280 281TEST_CASE(catch_all_newline_view) 282{ 283 Regex<PosixExtended> re("^.*$", PosixFlags::Multiline); 284 RegexResult result; 285 286 DeprecatedString aaa = "Hello World\nTest\n1234\n"; 287 result = match(aaa, re); 288 EXPECT_EQ(result.success, true); 289 EXPECT_EQ(result.count, 3u); 290 DeprecatedString str = "Hello World"; 291 EXPECT_EQ(result.matches.at(0).view, str.view()); 292 EXPECT_EQ(result.matches.at(1).view, "Test"); 293 EXPECT_EQ(result.matches.at(2).view, "1234"); 294} 295 296TEST_CASE(catch_all_newline_2) 297{ 298 Regex<PosixExtended> re("^.*$"); 299 RegexResult result; 300 result = match("Hello World\nTest\n1234\n"sv, re, PosixFlags::Multiline | PosixFlags::StringCopyMatches); 301 EXPECT_EQ(result.success, true); 302 EXPECT_EQ(result.count, 3u); 303 EXPECT_EQ(result.matches.at(0).view, "Hello World"); 304 EXPECT_EQ(result.matches.at(1).view, "Test"); 305 EXPECT_EQ(result.matches.at(2).view, "1234"); 306 307 result = match("Hello World\nTest\n1234\n"sv, re); 308 EXPECT_EQ(result.success, true); 309 EXPECT_EQ(result.count, 1u); 310 EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n"); 311} 312 313TEST_CASE(match_all_character_class) 314{ 315 Regex<PosixExtended> re("[[:alpha:]]"); 316 DeprecatedString str = "[Window]\nOpacity=255\nAudibleBeep=0\n"; 317 RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches); 318 319 EXPECT_EQ(result.success, true); 320 EXPECT_EQ(result.count, 24u); 321 EXPECT_EQ(result.matches.at(0).view, "W"); 322 EXPECT_EQ(result.matches.at(1).view, "i"); 323 EXPECT_EQ(result.matches.at(2).view, "n"); 324} 325 326TEST_CASE(match_character_class_with_assertion) 327{ 328 Regex<PosixExtended> re("[[:alpha:]]+$"); 329 DeprecatedString str = "abcdef"; 330 RegexResult result = match(str, re); 331 332 EXPECT_EQ(result.success, true); 333 EXPECT_EQ(result.count, 1u); 334} 335 336TEST_CASE(example_for_git_commit) 337{ 338 Regex<PosixExtended> re("^.*$"); 339 auto result = re.match("Well, hello friends!\nHello World!"sv); 340 341 EXPECT(result.success); 342 EXPECT(result.count == 1); 343 EXPECT(result.matches.at(0).view.starts_with("Well"sv)); 344 EXPECT(result.matches.at(0).view.length() == 33); 345 346 EXPECT(re.has_match("Well,...."sv)); 347 348 result = re.match("Well, hello friends!\nHello World!"sv, PosixFlags::Multiline); 349 350 EXPECT(result.success); 351 EXPECT(result.count == 2); 352 EXPECT(result.matches.at(0).view == "Well, hello friends!"); 353 EXPECT(result.matches.at(1).view == "Hello World!"); 354} 355 356TEST_CASE(email_address) 357{ 358 Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$"); 359 EXPECT(re.has_match("hello.world@domain.tld"sv)); 360 EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"sv)); 361} 362 363TEST_CASE(ini_file_entries) 364{ 365 Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]"); 366 RegexResult result; 367 368 if constexpr (REGEX_DEBUG) { 369 RegexDebug regex_dbg(stderr); 370 regex_dbg.print_raw_bytecode(re); 371 regex_dbg.print_header(); 372 regex_dbg.print_bytecode(re); 373 } 374 375 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; 376 EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true); 377 EXPECT_EQ(result.count, 3u); 378 379 if constexpr (REGEX_DEBUG) { 380 for (auto& v : result.matches) 381 fprintf(stderr, "%s\n", v.view.to_deprecated_string().characters()); 382 } 383 384 EXPECT_EQ(result.matches.at(0).view, "[Window]"); 385 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window"); 386 EXPECT_EQ(result.matches.at(1).view, "Opacity=255"); 387 EXPECT_EQ(result.matches.at(1).line, 1u); 388 EXPECT_EQ(result.matches.at(1).column, 0u); 389 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255"); 390 EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u); 391 EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u); 392 EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0"); 393 EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0"); 394 EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u); 395 EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u); 396} 397 398TEST_CASE(ini_file_entries2) 399{ 400 Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)"); 401 RegexResult result; 402 403 DeprecatedString haystack = "ViewMode=Icon"; 404 405 EXPECT_EQ(re.match(haystack.view(), result), false); 406 EXPECT_EQ(result.count, 0u); 407 408 EXPECT_EQ(re.search(haystack.view(), result), true); 409 EXPECT_EQ(result.count, 1u); 410} 411 412TEST_CASE(named_capture_group) 413{ 414 Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)"); 415 RegexResult result; 416 417 if constexpr (REGEX_DEBUG) { 418 RegexDebug regex_dbg(stderr); 419 regex_dbg.print_raw_bytecode(re); 420 regex_dbg.print_header(); 421 regex_dbg.print_bytecode(re); 422 } 423 424 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; 425 EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true); 426 EXPECT_EQ(result.count, 2u); 427 EXPECT_EQ(result.matches.at(0).view, "Opacity=255"); 428 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255"); 429 EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test"); 430 EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0"); 431 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0"); 432 EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test"); 433} 434 435TEST_CASE(ecma262_named_capture_group_with_dollar_sign) 436{ 437 Regex<ECMA262> re("[a-zA-Z]*=(?<$Test$>[0-9]*)"); 438 RegexResult result; 439 440 if constexpr (REGEX_DEBUG) { 441 RegexDebug regex_dbg(stderr); 442 regex_dbg.print_raw_bytecode(re); 443 regex_dbg.print_header(); 444 regex_dbg.print_bytecode(re); 445 } 446 447 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; 448 EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true); 449 EXPECT_EQ(result.count, 2u); 450 EXPECT_EQ(result.matches.at(0).view, "Opacity=255"); 451 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255"); 452 EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$"); 453 EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0"); 454 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0"); 455 EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$"); 456} 457 458TEST_CASE(a_star) 459{ 460 Regex<PosixExtended> re("a*"); 461 RegexResult result; 462 463 if constexpr (REGEX_DEBUG) { 464 RegexDebug regex_dbg(stderr); 465 regex_dbg.print_raw_bytecode(re); 466 regex_dbg.print_header(); 467 regex_dbg.print_bytecode(re); 468 } 469 470 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; 471 EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true); 472 EXPECT_EQ(result.count, 32u); 473 if (result.count == 32u) { 474 EXPECT_EQ(result.matches.at(0).view.length(), 0u); 475 EXPECT_EQ(result.matches.at(10).view.length(), 1u); 476 EXPECT_EQ(result.matches.at(10).view, "a"); 477 EXPECT_EQ(result.matches.at(31).view.length(), 0u); 478 } 479} 480 481TEST_CASE(simple_period_end_benchmark) 482{ 483 Regex<PosixExtended> re("hello.$"); 484 RegexResult m; 485 EXPECT_EQ(re.search("Hello1"sv, m), false); 486 EXPECT_EQ(re.search("hello1hello1"sv, m), true); 487 EXPECT_EQ(re.search("hello2hell"sv, m), false); 488 EXPECT_EQ(re.search("hello?"sv, m), true); 489} 490 491TEST_CASE(posix_extended_nested_capture_group) 492{ 493 Regex<PosixExtended> re("(h(e(?<llo>llo)))"); // group 0 -> "hello", group 1 -> "ello", group 2/"llo" -> "llo" 494 auto result = re.match("hello"sv); 495 EXPECT(result.success); 496 EXPECT_EQ(result.capture_group_matches.size(), 1u); 497 EXPECT_EQ(result.capture_group_matches[0].size(), 3u); 498 EXPECT_EQ(result.capture_group_matches[0][0].view, "hello"sv); 499 EXPECT_EQ(result.capture_group_matches[0][1].view, "ello"sv); 500 EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv); 501} 502 503auto parse_test_case_long_disjunction_chain = DeprecatedString::repeated("a|"sv, 100000); 504 505TEST_CASE(ECMA262_parse) 506{ 507 struct _test { 508 StringView pattern; 509 regex::Error expected_error { regex::Error::NoError }; 510 regex::ECMAScriptFlags flags {}; 511 }; 512 513 _test const tests[] { 514 { "^hello.$"sv }, 515 { "^(hello.)$"sv }, 516 { "^h{0,1}ello.$"sv }, 517 { "^hello\\W$"sv }, 518 { "^hell\\w.$"sv }, 519 { "^hell\\x6f1$"sv }, // ^hello1$ 520 { "^hel(?:l\\w).$"sv }, 521 { "^hel(?<LO>l\\w).$"sv }, 522 { "^[-a-zA-Z\\w\\s]+$"sv }, 523 { "\\bhello\\B"sv }, 524 { "^[\\w+/_-]+[=]{0,2}$"sv }, // #4189 525 { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)"sv }, // #4189 526 { "\\/"sv }, // #4189 527 { ",/=-:"sv }, // #4243 528 { "\\x"sv }, // Even invalid escapes are allowed if ~unicode. 529 { "\\x1"sv }, // Even invalid escapes are allowed if ~unicode. 530 { "\\x1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode }, 531 { "\\x11"sv }, 532 { "\\x11"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode }, 533 { "\\"sv, regex::Error::InvalidTrailingEscape }, 534 { "(?"sv, regex::Error::InvalidCaptureGroup }, 535 { "\\u1234"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode }, 536 { "[\\u1234]"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode }, 537 { "\\u1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode }, 538 { "[\\u1]"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode }, 539 { ",(?"sv, regex::Error::InvalidCaptureGroup }, // #4583 540 { "{1}"sv, regex::Error::InvalidPattern }, 541 { "{1,2}"sv, regex::Error::InvalidPattern }, 542 { "\\uxxxx"sv, regex::Error::NoError }, 543 { "\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 544 { "\\ud83d"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 545 { "\\ud83d\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 546 { "\\u{0}"sv }, 547 { "\\u{0}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 548 { "\\u{10ffff}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 549 { "\\u{10ffff"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 550 { "\\u{10ffffx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 551 { "\\u{110000}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 552 { "\\p"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 553 { "\\p{"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 554 { "\\p{}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, 555 { "\\p{AsCiI}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, 556 { "\\p{hello friends}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, 557 { "\\p{Prepended_Concatenation_Mark}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, 558 { "\\p{ASCII}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 559 { "\\\\p{1}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 560 { "\\\\p{AsCiI}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 561 { "\\\\p{ASCII}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 562 { "\\c"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 563 { "\\c"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 564 { "[\\c]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 565 { "[\\c]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 566 { "\\c`"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 567 { "\\c`"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 568 { "[\\c`]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 569 { "[\\c`]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 570 { "\\A"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 571 { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode }, 572 { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 573 { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 574 { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 575 { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 576 { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 577 { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 578 { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 579 { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 580 { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 581 { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 582 { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 583 { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 584 { "]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 585 { "]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 586 { "\\]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 587 { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, 588 { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, 589 { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, 590 { "a{9007199254740991}"sv }, // 2^53 - 1 591 { "a{9007199254740991,}"sv }, 592 { "a{9007199254740991,9007199254740991}"sv }, 593 { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent }, 594 { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent }, 595 { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent }, 596 { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent }, 597 { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent }, 598 { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture }, 599 { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture }, 600 { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, 601 { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, 602 { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup }, 603 { "(?<$$_$$>a)"sv }, 604 { "(?<ÿ>a)"sv }, 605 { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv }, 606 { "((?=lg)?[vl]k\\-?\\d{3}) bui| 3\\.[-\\w; ]{10}lg?-([06cv9]{3,4})"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // #12373, quantifiable assertions. 607 { parse_test_case_long_disjunction_chain.view() }, // A whole lot of disjunctions, should not overflow the stack. 608 { "(\"|')(?:(?!\\2)[^\\\\\\r\\n]|\\\\.)*\\2"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // LegacyOctalEscapeSequence should not consume too many chars (and should not crash) 609 }; 610 611 for (auto& test : tests) { 612 Regex<ECMA262> re(test.pattern, test.flags); 613 EXPECT_EQ(re.parser_result.error, test.expected_error); 614 if constexpr (REGEX_DEBUG) { 615 dbgln("\n"); 616 RegexDebug regex_dbg(stderr); 617 regex_dbg.print_raw_bytecode(re); 618 regex_dbg.print_header(); 619 regex_dbg.print_bytecode(re); 620 dbgln("\n"); 621 } 622 } 623} 624 625TEST_CASE(ECMA262_match) 626{ 627 struct _test { 628 StringView pattern; 629 StringView subject; 630 bool matches { true }; 631 ECMAScriptFlags options {}; 632 }; 633 // clang-format off 634 constexpr _test tests[] { 635 { "^hello.$"sv, "hello1"sv }, 636 { "^(hello.)$"sv, "hello1"sv }, 637 { "^h{0,1}ello.$"sv, "ello1"sv }, 638 { "^hello\\W$"sv, "hello!"sv }, 639 { "^hell\\w.$"sv, "hellx!"sv }, 640 { "^hell\\x6f1$"sv, "hello1"sv }, 641 { "^hel(?<LO>l.)1$"sv, "hello1"sv }, 642 { "^hel(?<LO>l.)1*\\k<LO>.$"sv, "hello1lo1"sv }, 643 { "^[-a-z1-3\\s]+$"sv, "hell2 o1"sv }, 644 { "^[\\0-\\x1f]$"sv, "\n"sv }, 645 { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global }, 646 { "\\b.*\\b"sv, "hello1"sv }, 647 { "[^\\D\\S]{2}"sv, "1 "sv }, 648 { "bar(?=f.)foo"sv, "barfoo"sv }, 649 { "bar(?=foo)bar"sv, "barbar"sv, false }, 650 { "bar(?!foo)bar"sv, "barbar"sv, true }, 651 { "bar(?!bar)bar"sv, "barbar"sv, false }, 652 { "bar.*(?<=foo)"sv, "barbar"sv, false }, 653 { "bar.*(?<!foo)"sv, "barbar"sv, true }, 654 { "((...)X)+"sv, "fooXbarXbazX"sv, true }, 655 { "(?:)"sv, ""sv, true }, 656 { "\\^"sv, "^"sv }, 657 { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode }, 658 { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode }, 659 { "(a{2}){3}"sv, "aaaaaa"sv }, 660 { "(a{2}){3}"sv, "aaaabaa"sv, false }, 661 { "(a{2}){4}"sv, "aaaaaaaa"sv }, 662 { "(a{2}){4}"sv, "aaaaaabaa"sv, false }, 663 { "(a{3}){2}"sv, "aaaaaa"sv }, 664 { "(a{3}){2}"sv, "aaaabaa"sv, false }, 665 { "(a{4}){2}"sv, "aaaaaaaa"sv }, 666 { "(a{4}){2}"sv, "aaaaaabaa"sv, false }, 667 { "\\u{4}"sv, "uuuu"sv }, 668 { "(?<=.{3})f"sv, "abcdef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, 669 { "(?<=.{3})f"sv, "abc😀ef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, 670 // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers 671 { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended }, 672 { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended }, 673 { "\\05"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended }, 674 { "\\455"sv, "\45""5"sv, true, ECMAScriptFlags::BrowserExtended }, 675 { "\\314"sv, "\314"sv, true, ECMAScriptFlags::BrowserExtended }, 676 { "\\c"sv, "\\c"sv, true, ECMAScriptFlags::BrowserExtended }, 677 { "\\cf"sv, "\06"sv, true, ECMAScriptFlags::BrowserExtended }, 678 { "\\c1"sv, "\\c1"sv, true, ECMAScriptFlags::BrowserExtended }, 679 { "[\\c1]"sv, "\x11"sv, true, ECMAScriptFlags::BrowserExtended }, 680 { "[\\w-\\d]"sv, "-"sv, true, ECMAScriptFlags::BrowserExtended }, 681 { "^(?:^^\\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|,|-=|->|\\/|\\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\\^=|\\^\\^|\\^\\^=|{|\\||\\|=|\\|\\||\\|\\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*(\\/(?=[^*/])(?:[^/[\\\\]|\\\\[\\S\\s]|\\[(?:[^\\\\\\]]|\\\\[\\S\\s])*(?:]|$))+\\/)"sv, 682 "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended 683 }, // #5517, appears to be matching JS expressions that involve regular expressions... 684 { "a{2,}"sv, "aaaa"sv }, // #5518 685 { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, 686 { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 687 { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, 688 { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, 689 { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, 690 { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, 691 { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern 692 { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails. 693 { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too. 694 { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag. 695 { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^]. 696 { "^[_A-Z]+$"sv, "_aA"sv, true, ECMAScriptFlags::Insensitive }, // Insensitive lookup table: characters in a range do not necessarily lie in the same range after being converted to lowercase. 697 { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive }, 698 { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive }, 699 { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive }, 700 { "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll. 701 }; 702 // clang-format on 703 704 for (auto& test : tests) { 705 Regex<ECMA262> re(test.pattern, test.options); 706 if constexpr (REGEX_DEBUG) { 707 dbgln("\n"); 708 RegexDebug regex_dbg(stderr); 709 regex_dbg.print_raw_bytecode(re); 710 regex_dbg.print_header(); 711 regex_dbg.print_bytecode(re); 712 dbgln("\n"); 713 } 714 EXPECT_EQ(re.parser_result.error, regex::Error::NoError); 715 EXPECT_EQ(re.match(test.subject).success, test.matches); 716 } 717} 718 719TEST_CASE(ECMA262_unicode_match) 720{ 721 constexpr auto space_and_line_terminator_code_points = Array { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF }; 722 723 StringBuilder builder; 724 for (u32 code_point : space_and_line_terminator_code_points) 725 builder.append_code_point(code_point); 726 auto space_and_line_terminators = builder.to_deprecated_string(); 727 728 struct _test { 729 StringView pattern; 730 StringView subject; 731 bool matches { true }; 732 ECMAScriptFlags options {}; 733 }; 734 _test tests[] { 735 { "\xf0\x9d\x8c\x86"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode }, 736 { "[\xf0\x9d\x8c\x86]"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode }, 737 { "\\ud83d"sv, "😀"sv, true }, 738 { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode }, 739 { "\\ude00"sv, "😀"sv, true }, 740 { "\\ude00"sv, "😀"sv, false, ECMAScriptFlags::Unicode }, 741 { "\\ud83d\\ude00"sv, "😀"sv, true }, 742 { "\\ud83d\\ude00"sv, "😀"sv, true, ECMAScriptFlags::Unicode }, 743 { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode }, 744 { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true }, 745 { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode }, 746 { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode }, 747 { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode }, 748 { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, 749 { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, 750 { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, 751 { "^\\s+$"sv, space_and_line_terminators }, 752 { "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode }, 753 }; 754 755 for (auto& test : tests) { 756 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options); 757 758 auto subject = MUST(AK::utf8_to_utf16(test.subject)); 759 Utf16View view { subject }; 760 761 if constexpr (REGEX_DEBUG) { 762 dbgln("\n"); 763 RegexDebug regex_dbg(stderr); 764 regex_dbg.print_raw_bytecode(re); 765 regex_dbg.print_header(); 766 regex_dbg.print_bytecode(re); 767 dbgln("\n"); 768 } 769 770 EXPECT_EQ(re.parser_result.error, regex::Error::NoError); 771 EXPECT_EQ(re.match(view).success, test.matches); 772 } 773} 774 775TEST_CASE(ECMA262_unicode_sets_match) 776{ 777 struct _test { 778 StringView pattern; 779 StringView subject; 780 bool matches { true }; 781 ECMAScriptFlags options {}; 782 }; 783 784 constexpr _test tests[] { 785 { "[\\w--x]"sv, "x"sv, false }, 786 { "[\\w&&x]"sv, "y"sv, false }, 787 { "[\\w--x]"sv, "y"sv, true }, 788 { "[\\w&&x]"sv, "x"sv, true }, 789 { "[[0-9\\w]--x--6]"sv, "6"sv, false }, 790 { "[[0-9\\w]--x--6]"sv, "x"sv, false }, 791 { "[[0-9\\w]--x--6]"sv, "y"sv, true }, 792 { "[[0-9\\w]--x--6]"sv, "9"sv, true }, 793 { "[\\w&&\\d]"sv, "a"sv, false }, 794 { "[\\w&&\\d]"sv, "4"sv, true }, 795 }; 796 797 for (auto& test : tests) { 798 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::UnicodeSets | test.options); 799 if constexpr (REGEX_DEBUG) { 800 dbgln("\n"); 801 RegexDebug regex_dbg(stderr); 802 regex_dbg.print_raw_bytecode(re); 803 regex_dbg.print_header(); 804 regex_dbg.print_bytecode(re); 805 dbgln("\n"); 806 } 807 808 EXPECT_EQ(re.parser_result.error, regex::Error::NoError); 809 auto result = re.match(test.subject).success; 810 EXPECT_EQ(result, test.matches); 811 } 812} 813 814TEST_CASE(ECMA262_property_match) 815{ 816 struct _test { 817 StringView pattern; 818 StringView subject; 819 bool matches { true }; 820 ECMAScriptFlags options {}; 821 }; 822 823 constexpr _test tests[] { 824 { "\\p{ASCII}"sv, "a"sv, false }, 825 { "\\p{ASCII}"sv, "p{ASCII}"sv, true }, 826 { "\\p{ASCII}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 827 { "\\p{ASCII}"sv, "😀"sv, false, ECMAScriptFlags::Unicode }, 828 { "\\P{ASCII}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 829 { "\\P{ASCII}"sv, "😀"sv, true, ECMAScriptFlags::Unicode }, 830 { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, ECMAScriptFlags::Unicode }, 831 { "\\p{ASCII_Hex_Digit}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 832 { "\\p{ASCII_Hex_Digit}"sv, "x"sv, false, ECMAScriptFlags::Unicode }, 833 { "\\P{ASCII_Hex_Digit}"sv, "1"sv, false, ECMAScriptFlags::Unicode }, 834 { "\\P{ASCII_Hex_Digit}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 835 { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, ECMAScriptFlags::Unicode }, 836 { "\\p{Any}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. 837 { "\\P{Any}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. 838 { "\\p{Assigned}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. 839 { "\\P{Assigned}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point. 840 { "\\p{Lu}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 841 { "\\p{Lu}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 842 { "\\p{Lu}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 843 { "\\p{Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 844 { "\\p{Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 845 { "\\p{Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 846 { "\\P{Cased_Letter}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 847 { "\\P{Cased_Letter}"sv, "A"sv, false, ECMAScriptFlags::Unicode }, 848 { "\\P{Cased_Letter}"sv, "9"sv, true, ECMAScriptFlags::Unicode }, 849 { "\\p{General_Category=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 850 { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 851 { "\\p{General_Category=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 852 { "\\p{gc=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 853 { "\\p{gc=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 854 { "\\p{gc=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 855 { "\\p{Script=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 856 { "\\p{Script=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 857 { "\\p{Script=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 858 { "\\p{sc=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode }, 859 { "\\p{sc=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode }, 860 { "\\p{sc=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode }, 861 { "\\p{Script_Extensions=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 862 { "\\p{Script_Extensions=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5 863 { "\\p{Script_Extensions=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5 864 { "\\p{scx=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode }, 865 { "\\p{scx=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5 866 { "\\p{scx=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5 867 }; 868 869 for (auto& test : tests) { 870 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options); 871 872 auto subject = MUST(AK::utf8_to_utf16(test.subject)); 873 Utf16View view { subject }; 874 875 if constexpr (REGEX_DEBUG) { 876 dbgln("\n"); 877 RegexDebug regex_dbg(stderr); 878 regex_dbg.print_raw_bytecode(re); 879 regex_dbg.print_header(); 880 regex_dbg.print_bytecode(re); 881 dbgln("\n"); 882 } 883 884 EXPECT_EQ(re.parser_result.error, regex::Error::NoError); 885 EXPECT_EQ(re.match(view).success, test.matches); 886 } 887} 888 889TEST_CASE(replace) 890{ 891 struct _test { 892 StringView pattern; 893 StringView replacement; 894 StringView subject; 895 StringView expected; 896 ECMAScriptFlags options {}; 897 }; 898 899 constexpr _test tests[] { 900 { "foo(.+)"sv, "aaa"sv, "test"sv, "test"sv }, 901 { "foo(.+)"sv, "test\\1"sv, "foobar"sv, "testbar"sv }, 902 { "foo(.+)"sv, "\\2\\1"sv, "foobar"sv, "\\2bar"sv }, 903 { "foo(.+)"sv, "\\\\\\1"sv, "foobar"sv, "\\bar"sv }, 904 { "foo(.)"sv, "a\\1"sv, "fooxfooy"sv, "axay"sv, ECMAScriptFlags::Multiline }, 905 }; 906 907 for (auto& test : tests) { 908 Regex<ECMA262> re(test.pattern, test.options); 909 if constexpr (REGEX_DEBUG) { 910 dbgln("\n"); 911 RegexDebug regex_dbg(stderr); 912 regex_dbg.print_raw_bytecode(re); 913 regex_dbg.print_header(); 914 regex_dbg.print_bytecode(re); 915 dbgln("\n"); 916 } 917 EXPECT_EQ(re.parser_result.error, regex::Error::NoError); 918 EXPECT_EQ(re.replace(test.subject, test.replacement), test.expected); 919 } 920} 921 922TEST_CASE(case_insensitive_match) 923{ 924 Regex<PosixExtended> re("cd", PosixFlags::Insensitive | PosixFlags::Global); 925 auto result = re.match("AEKFCD"sv); 926 927 EXPECT_EQ(result.success, true); 928 if (result.success) { 929 EXPECT_EQ(result.matches.at(0).column, 4ul); 930 } 931} 932 933TEST_CASE(extremely_long_fork_chain) 934{ 935 Regex<ECMA262> re("(?:aa)*"); 936 auto result = re.match(DeprecatedString::repeated('a', 1000)); 937 EXPECT_EQ(result.success, true); 938} 939 940TEST_CASE(theoretically_infinite_loop) 941{ 942 Array patterns { 943 "(a*)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit. 944 "(a*?)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit. 945 "(a*)*?"sv, // Should match exactly nothing. 946 "(?:)*?"sv, // Should not generate an infinite fork loop. 947 }; 948 for (auto& pattern : patterns) { 949 Regex<ECMA262> re(pattern); 950 auto result = re.match(""sv); 951 EXPECT_EQ(result.success, true); 952 } 953} 954 955static auto g_lots_of_a_s = DeprecatedString::repeated('a', 10'000'000); 956 957BENCHMARK_CASE(fork_performance) 958{ 959 Regex<ECMA262> re("(?:aa)*"); 960 auto result = re.match(g_lots_of_a_s); 961 EXPECT_EQ(result.success, true); 962} 963 964TEST_CASE(optimizer_atomic_groups) 965{ 966 Array tests { 967 // Fork -> ForkReplace 968 Tuple { "a*b"sv, "aaaaa"sv, false }, 969 Tuple { "a+b"sv, "aaaaa"sv, false }, 970 Tuple { "\\\\(\\d+)"sv, "\\\\"sv, false }, // Rewrite bug turning a+ to a*, see #10952. 971 Tuple { "[a-z.]+\\."sv, "..."sv, true }, // Rewrite bug, incorrect interpretation of Compare. 972 Tuple { "[.-]+\\."sv, ".-."sv, true }, 973 // Alternative fuse 974 Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true }, 975 Tuple { "(a|a)"sv, "a"sv, true }, 976 Tuple { "(a|)"sv, ""sv, true }, // Ensure that empty alternatives are not outright removed 977 Tuple { "a{2,3}|a{5,8}"sv, "abc"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247. 978 Tuple { "^(a{2,3}|a{5,8})$"sv, "aaaa"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247. 979 // Optimizer should not chop off *half* of an instruction when fusing instructions. 980 Tuple { "cubic-bezier\\(\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*\\)"sv, "cubic-bezier(.05, 0, 0, 1)"sv, true }, 981 // ForkReplace shouldn't be applied where it would change the semantics 982 Tuple { "(1+)\\1"sv, "11"sv, true }, 983 Tuple { "(1+)1"sv, "11"sv, true }, 984 Tuple { "(1+)0"sv, "10"sv, true }, 985 // Rewrite should not skip over first required iteration of <x>+. 986 Tuple { "a+"sv, ""sv, false }, 987 // 'y' and [^x] have an overlap ('y'), the loop should not be rewritten here. 988 Tuple { "[^x]+y"sv, "ay"sv, true }, 989 // .+ should not be rewritten here, as it's followed by something that would be matched by `.`. 990 Tuple { ".+(a|b|c)"sv, "xxa"sv, true }, 991 }; 992 993 for (auto& test : tests) { 994 Regex<ECMA262> re(test.get<0>()); 995 auto result = re.match(test.get<1>()); 996 EXPECT_EQ(result.success, test.get<2>()); 997 } 998} 999 1000TEST_CASE(optimizer_char_class_lut) 1001{ 1002 Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)"); 1003 1004 if constexpr (REGEX_DEBUG) { 1005 dbgln("\n"); 1006 RegexDebug regex_dbg(stderr); 1007 regex_dbg.print_raw_bytecode(re); 1008 regex_dbg.print_header(); 1009 regex_dbg.print_bytecode(re); 1010 dbgln("\n"); 1011 } 1012 1013 // This will go through _all_ alternatives in the character class, and then fail. 1014 for (size_t i = 0; i < 1'000'000; ++i) 1015 EXPECT_EQ(re.match("1635488940000"sv).success, false); 1016} 1017 1018TEST_CASE(optimizer_alternation) 1019{ 1020 Array tests { 1021 // Pattern, Subject, Expected length 1022 Tuple { "a|"sv, "a"sv, 1u }, 1023 }; 1024 1025 for (auto& test : tests) { 1026 Regex<ECMA262> re(test.get<0>()); 1027 auto result = re.match(test.get<1>()); 1028 EXPECT(result.success); 1029 EXPECT_EQ(result.matches.first().view.length(), test.get<2>()); 1030 } 1031} 1032 1033TEST_CASE(posix_basic_dollar_is_end_anchor) 1034{ 1035 // Ensure that a dollar sign at the end only matches the end of the line. 1036 { 1037 Regex<PosixBasic> re("abc$"); 1038 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false); 1039 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, true); 1040 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, false); 1041 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false); 1042 } 1043} 1044 1045TEST_CASE(posix_basic_dollar_is_literal) 1046{ 1047 // Ensure that a dollar sign in the middle is treated as a literal. 1048 { 1049 Regex<PosixBasic> re("abc$d"); 1050 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false); 1051 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false); 1052 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true); 1053 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false); 1054 } 1055 1056 // Ensure that a dollar sign is always treated as a literal if escaped, even if at the end of the pattern. 1057 { 1058 Regex<PosixBasic> re("abc\\$"); 1059 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false); 1060 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false); 1061 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true); 1062 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, true); 1063 } 1064} 1065 1066TEST_CASE(negative_lookahead) 1067{ 1068 { 1069 // Negative lookahead with more than 2 forks difference between lookahead init and finish. 1070 auto options = ECMAScriptOptions { ECMAScriptFlags::Global }; 1071 options.reset_flag((ECMAScriptFlags)regex::AllFlags::Internal_Stateful); 1072 Regex<ECMA262> re(":(?!\\^\\)|1)", options); 1073 EXPECT_EQ(re.match(":^)"sv).success, false); 1074 EXPECT_EQ(re.match(":1"sv).success, false); 1075 EXPECT_EQ(re.match(":foobar"sv).success, true); 1076 } 1077 { 1078 // Correctly count forks with nested groups and optimised loops 1079 Regex<ECMA262> re("^((?:[^\\n]|\\n(?! *\\n))+)(?:\\n *)+\\n"); 1080 EXPECT_EQ(re.match("foo\n\n"sv).success, true); 1081 EXPECT_EQ(re.match("foo\n"sv).success, false); 1082 } 1083} 1084 1085TEST_CASE(single_match_flag) 1086{ 1087 { 1088 // Ensure that only a single match is produced and nothing past that. 1089 Regex<ECMA262> re("[\\u0008-\\uffff]"sv, ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch); 1090 auto result = re.match("ABC"sv); 1091 EXPECT_EQ(result.success, true); 1092 EXPECT_EQ(result.matches.size(), 1u); 1093 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "A"sv); 1094 } 1095} 1096 1097TEST_CASE(empty_string_wildcard_match) 1098{ 1099 { 1100 // Ensure that the wildcard ".*" matches the empty string exactly once 1101 Regex<ECMA262> re(".*"sv, ECMAScriptFlags::Global); 1102 auto result = re.match(""sv); 1103 EXPECT_EQ(result.success, true); 1104 EXPECT_EQ(result.matches.size(), 1u); 1105 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), ""sv); 1106 } 1107} 1108 1109TEST_CASE(inversion_state_in_char_class) 1110{ 1111 { 1112 // #13755, /[\S\s]/.exec("hello") should be [ "h" ], not null. 1113 Regex<ECMA262> re("[\\S\\s]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch); 1114 1115 auto result = re.match("hello"sv); 1116 EXPECT_EQ(result.success, true); 1117 EXPECT_EQ(result.matches.size(), 1u); 1118 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "h"sv); 1119 } 1120 { 1121 Regex<ECMA262> re("^(?:([^\\s!\"#%-,\\./;->@\\[-\\^`\\{-~]+(?=([=~}\\s/.)|]))))"sv, ECMAScriptFlags::Global); 1122 1123 auto result = re.match("slideNumbers}}"sv); 1124 EXPECT_EQ(result.success, true); 1125 EXPECT_EQ(result.matches.size(), 1u); 1126 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "slideNumbers"sv); 1127 EXPECT_EQ(result.capture_group_matches.first()[0].view.to_deprecated_string(), "slideNumbers"sv); 1128 EXPECT_EQ(result.capture_group_matches.first()[1].view.to_deprecated_string(), "}"sv); 1129 } 1130}