Serenity Operating System
1/*
2 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <LibTest/TestCase.h> // import first, to prevent warning of VERIFY* redefinition
8
9#include <AK/Debug.h>
10#include <AK/StringBuilder.h>
11#include <AK/Tuple.h>
12#include <LibRegex/Regex.h>
13#include <LibRegex/RegexDebug.h>
14#include <LibRegex/RegexMatcher.h>
15#include <stdio.h>
16
17static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
18{
19 return options;
20}
21
22static PosixOptions match_test_api_options(const PosixOptions options)
23{
24 return options;
25}
26
27template<typename... Flags>
28static constexpr ECMAScriptFlags combine_flags(Flags&&... flags)
29requires((IsSame<Flags, ECMAScriptFlags> && ...))
30{
31 return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...));
32}
33
34TEST_CASE(regex_options_ecmascript)
35{
36 ECMAScriptOptions eo;
37 eo |= ECMAScriptFlags::Global;
38
39 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global));
40 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive));
41
42 eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
43 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global));
44 EXPECT(eo.has_flag_set(ECMAScriptFlags::Insensitive));
45 EXPECT(eo.has_flag_set(ECMAScriptFlags::Sticky));
46 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Unicode));
47 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline));
48 EXPECT(!eo.has_flag_set(ECMAScriptFlags::SingleLine));
49
50 eo &= ECMAScriptFlags::Insensitive;
51 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Global));
52 EXPECT(eo.has_flag_set(ECMAScriptFlags::Insensitive));
53 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline));
54
55 eo &= ECMAScriptFlags::Sticky;
56 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Global));
57 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive));
58 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Multiline));
59 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Sticky));
60
61 eo = ~ECMAScriptFlags::Insensitive;
62 EXPECT(eo.has_flag_set(ECMAScriptFlags::Global));
63 EXPECT(!eo.has_flag_set(ECMAScriptFlags::Insensitive));
64 EXPECT(eo.has_flag_set(ECMAScriptFlags::Multiline));
65 EXPECT(eo.has_flag_set(ECMAScriptFlags::Sticky));
66}
67
68TEST_CASE(regex_options_posix)
69{
70 PosixOptions eo;
71 eo |= PosixFlags::Global;
72
73 EXPECT(eo.has_flag_set(PosixFlags::Global));
74 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive));
75
76 eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
77 EXPECT(eo.has_flag_set(PosixFlags::Global));
78 EXPECT(eo.has_flag_set(PosixFlags::Insensitive));
79 EXPECT(eo.has_flag_set(PosixFlags::MatchNotBeginOfLine));
80 EXPECT(!eo.has_flag_set(PosixFlags::Unicode));
81 EXPECT(!eo.has_flag_set(PosixFlags::Multiline));
82
83 eo &= PosixFlags::Insensitive;
84 EXPECT(!eo.has_flag_set(PosixFlags::Global));
85 EXPECT(eo.has_flag_set(PosixFlags::Insensitive));
86 EXPECT(!eo.has_flag_set(PosixFlags::Multiline));
87
88 eo &= PosixFlags::MatchNotBeginOfLine;
89 EXPECT(!eo.has_flag_set(PosixFlags::Global));
90 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive));
91 EXPECT(!eo.has_flag_set(PosixFlags::Multiline));
92
93 eo = ~PosixFlags::Insensitive;
94 EXPECT(eo.has_flag_set(PosixFlags::Global));
95 EXPECT(!eo.has_flag_set(PosixFlags::Insensitive));
96 EXPECT(eo.has_flag_set(PosixFlags::Multiline));
97}
98
99TEST_CASE(regex_lexer)
100{
101 Lexer l("/[.*+?^${}()|[\\]\\\\]/g"sv);
102 EXPECT(l.next().type() == regex::TokenType::Slash);
103 EXPECT(l.next().type() == regex::TokenType::LeftBracket);
104 EXPECT(l.next().type() == regex::TokenType::Period);
105 EXPECT(l.next().type() == regex::TokenType::Asterisk);
106 EXPECT(l.next().type() == regex::TokenType::Plus);
107 EXPECT(l.next().type() == regex::TokenType::Questionmark);
108 EXPECT(l.next().type() == regex::TokenType::Circumflex);
109 EXPECT(l.next().type() == regex::TokenType::Dollar);
110 EXPECT(l.next().type() == regex::TokenType::LeftCurly);
111 EXPECT(l.next().type() == regex::TokenType::RightCurly);
112 EXPECT(l.next().type() == regex::TokenType::LeftParen);
113 EXPECT(l.next().type() == regex::TokenType::RightParen);
114 EXPECT(l.next().type() == regex::TokenType::Pipe);
115 EXPECT(l.next().type() == regex::TokenType::LeftBracket);
116 EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
117 EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
118 EXPECT(l.next().type() == regex::TokenType::RightBracket);
119 EXPECT(l.next().type() == regex::TokenType::Slash);
120 EXPECT(l.next().type() == regex::TokenType::Char);
121}
122
123TEST_CASE(parser_error_parens)
124{
125 DeprecatedString pattern = "test()test";
126 Lexer l(pattern);
127 PosixExtendedParser p(l);
128 p.parse();
129 EXPECT(p.has_error());
130 EXPECT(p.error() == regex::Error::EmptySubExpression);
131}
132
133TEST_CASE(parser_error_special_characters_used_at_wrong_place)
134{
135 DeprecatedString pattern;
136 Vector<char, 5> chars = { '*', '+', '?', '{' };
137 StringBuilder b;
138
139 Lexer l;
140 PosixExtended p(l);
141
142 for (auto& ch : chars) {
143 // First in ere
144 b.clear();
145 b.append(ch);
146 pattern = b.to_deprecated_string();
147 l.set_source(pattern);
148 p.parse();
149 EXPECT(p.has_error());
150 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
151
152 // After vertical line
153 b.clear();
154 b.append("a|"sv);
155 b.append(ch);
156 pattern = b.to_deprecated_string();
157 l.set_source(pattern);
158 p.parse();
159 EXPECT(p.has_error());
160 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
161
162 // After circumflex
163 b.clear();
164 b.append('^');
165 b.append(ch);
166 pattern = b.to_deprecated_string();
167 l.set_source(pattern);
168 p.parse();
169 EXPECT(p.has_error());
170 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
171
172 // After dollar
173 b.clear();
174 b.append('$');
175 b.append(ch);
176 pattern = b.to_deprecated_string();
177 l.set_source(pattern);
178 p.parse();
179 EXPECT(p.has_error());
180 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
181
182 // After left parens
183 b.clear();
184 b.append('(');
185 b.append(ch);
186 b.append(')');
187 pattern = b.to_deprecated_string();
188 l.set_source(pattern);
189 p.parse();
190 EXPECT(p.has_error());
191 EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
192 }
193}
194
195TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
196{
197 Lexer l;
198 PosixExtended p(l);
199
200 // First in ere
201 l.set_source("|asdf"sv);
202 p.parse();
203 EXPECT(p.has_error());
204 EXPECT(p.error() == regex::Error::EmptySubExpression);
205
206 // Last in ere
207 l.set_source("asdf|"sv);
208 p.parse();
209 EXPECT(p.has_error());
210 EXPECT(p.error() == regex::Error::EmptySubExpression);
211
212 // After left parens
213 l.set_source("(|asdf)"sv);
214 p.parse();
215 EXPECT(p.has_error());
216 EXPECT(p.error() == regex::Error::EmptySubExpression);
217
218 // Proceed right parens
219 l.set_source("(asdf)|"sv);
220 p.parse();
221 EXPECT(p.has_error());
222 EXPECT(p.error() == regex::Error::EmptySubExpression);
223}
224
225TEST_CASE(catch_all_first)
226{
227 Regex<PosixExtended> re("^.*$");
228 RegexResult m;
229 re.match("Hello World"sv, m);
230 EXPECT(m.count == 1);
231 EXPECT(re.match("Hello World"sv, m));
232}
233
234TEST_CASE(catch_all)
235{
236 Regex<PosixExtended> re("^.*$", PosixFlags::Global);
237
238 EXPECT(re.has_match("Hello World"sv));
239 EXPECT(re.match("Hello World"sv).success);
240 EXPECT(re.match("Hello World"sv).count == 1);
241
242 EXPECT(has_match("Hello World"sv, re));
243 auto res = match("Hello World"sv, re);
244 EXPECT(res.success);
245 EXPECT(res.count == 1);
246 EXPECT(res.matches.size() == 1);
247 EXPECT(res.matches.first().view == "Hello World");
248}
249
250TEST_CASE(catch_all_again)
251{
252 Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
253 EXPECT_EQ(has_match("Hello World"sv, re), true);
254}
255
256TEST_CASE(char_utf8)
257{
258 Regex<PosixExtended> re("😀");
259 RegexResult result;
260
261 EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界"sv }, re, PosixFlags::Global)).success, true);
262 EXPECT_EQ(result.count, 2u);
263}
264
265TEST_CASE(catch_all_newline)
266{
267 Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
268 RegexResult result;
269 auto lambda = [&result, &re]() {
270 DeprecatedString aaa = "Hello World\nTest\n1234\n";
271 result = match(aaa, re);
272 EXPECT_EQ(result.success, true);
273 };
274 lambda();
275 EXPECT_EQ(result.count, 3u);
276 EXPECT_EQ(result.matches.at(0).view, "Hello World");
277 EXPECT_EQ(result.matches.at(1).view, "Test");
278 EXPECT_EQ(result.matches.at(2).view, "1234");
279}
280
281TEST_CASE(catch_all_newline_view)
282{
283 Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
284 RegexResult result;
285
286 DeprecatedString aaa = "Hello World\nTest\n1234\n";
287 result = match(aaa, re);
288 EXPECT_EQ(result.success, true);
289 EXPECT_EQ(result.count, 3u);
290 DeprecatedString str = "Hello World";
291 EXPECT_EQ(result.matches.at(0).view, str.view());
292 EXPECT_EQ(result.matches.at(1).view, "Test");
293 EXPECT_EQ(result.matches.at(2).view, "1234");
294}
295
296TEST_CASE(catch_all_newline_2)
297{
298 Regex<PosixExtended> re("^.*$");
299 RegexResult result;
300 result = match("Hello World\nTest\n1234\n"sv, re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
301 EXPECT_EQ(result.success, true);
302 EXPECT_EQ(result.count, 3u);
303 EXPECT_EQ(result.matches.at(0).view, "Hello World");
304 EXPECT_EQ(result.matches.at(1).view, "Test");
305 EXPECT_EQ(result.matches.at(2).view, "1234");
306
307 result = match("Hello World\nTest\n1234\n"sv, re);
308 EXPECT_EQ(result.success, true);
309 EXPECT_EQ(result.count, 1u);
310 EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
311}
312
313TEST_CASE(match_all_character_class)
314{
315 Regex<PosixExtended> re("[[:alpha:]]");
316 DeprecatedString str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
317 RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
318
319 EXPECT_EQ(result.success, true);
320 EXPECT_EQ(result.count, 24u);
321 EXPECT_EQ(result.matches.at(0).view, "W");
322 EXPECT_EQ(result.matches.at(1).view, "i");
323 EXPECT_EQ(result.matches.at(2).view, "n");
324}
325
326TEST_CASE(match_character_class_with_assertion)
327{
328 Regex<PosixExtended> re("[[:alpha:]]+$");
329 DeprecatedString str = "abcdef";
330 RegexResult result = match(str, re);
331
332 EXPECT_EQ(result.success, true);
333 EXPECT_EQ(result.count, 1u);
334}
335
336TEST_CASE(example_for_git_commit)
337{
338 Regex<PosixExtended> re("^.*$");
339 auto result = re.match("Well, hello friends!\nHello World!"sv);
340
341 EXPECT(result.success);
342 EXPECT(result.count == 1);
343 EXPECT(result.matches.at(0).view.starts_with("Well"sv));
344 EXPECT(result.matches.at(0).view.length() == 33);
345
346 EXPECT(re.has_match("Well,...."sv));
347
348 result = re.match("Well, hello friends!\nHello World!"sv, PosixFlags::Multiline);
349
350 EXPECT(result.success);
351 EXPECT(result.count == 2);
352 EXPECT(result.matches.at(0).view == "Well, hello friends!");
353 EXPECT(result.matches.at(1).view == "Hello World!");
354}
355
356TEST_CASE(email_address)
357{
358 Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
359 EXPECT(re.has_match("hello.world@domain.tld"sv));
360 EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"sv));
361}
362
363TEST_CASE(ini_file_entries)
364{
365 Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
366 RegexResult result;
367
368 if constexpr (REGEX_DEBUG) {
369 RegexDebug regex_dbg(stderr);
370 regex_dbg.print_raw_bytecode(re);
371 regex_dbg.print_header();
372 regex_dbg.print_bytecode(re);
373 }
374
375 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
376 EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
377 EXPECT_EQ(result.count, 3u);
378
379 if constexpr (REGEX_DEBUG) {
380 for (auto& v : result.matches)
381 fprintf(stderr, "%s\n", v.view.to_deprecated_string().characters());
382 }
383
384 EXPECT_EQ(result.matches.at(0).view, "[Window]");
385 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");
386 EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
387 EXPECT_EQ(result.matches.at(1).line, 1u);
388 EXPECT_EQ(result.matches.at(1).column, 0u);
389 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
390 EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
391 EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
392 EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
393 EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
394 EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
395 EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
396}
397
398TEST_CASE(ini_file_entries2)
399{
400 Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)");
401 RegexResult result;
402
403 DeprecatedString haystack = "ViewMode=Icon";
404
405 EXPECT_EQ(re.match(haystack.view(), result), false);
406 EXPECT_EQ(result.count, 0u);
407
408 EXPECT_EQ(re.search(haystack.view(), result), true);
409 EXPECT_EQ(result.count, 1u);
410}
411
412TEST_CASE(named_capture_group)
413{
414 Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
415 RegexResult result;
416
417 if constexpr (REGEX_DEBUG) {
418 RegexDebug regex_dbg(stderr);
419 regex_dbg.print_raw_bytecode(re);
420 regex_dbg.print_header();
421 regex_dbg.print_bytecode(re);
422 }
423
424 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
425 EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
426 EXPECT_EQ(result.count, 2u);
427 EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
428 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
429 EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test");
430 EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
431 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
432 EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test");
433}
434
435TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
436{
437 Regex<ECMA262> re("[a-zA-Z]*=(?<$Test$>[0-9]*)");
438 RegexResult result;
439
440 if constexpr (REGEX_DEBUG) {
441 RegexDebug regex_dbg(stderr);
442 regex_dbg.print_raw_bytecode(re);
443 regex_dbg.print_header();
444 regex_dbg.print_bytecode(re);
445 }
446
447 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
448 EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
449 EXPECT_EQ(result.count, 2u);
450 EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
451 EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
452 EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$");
453 EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
454 EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
455 EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$");
456}
457
458TEST_CASE(a_star)
459{
460 Regex<PosixExtended> re("a*");
461 RegexResult result;
462
463 if constexpr (REGEX_DEBUG) {
464 RegexDebug regex_dbg(stderr);
465 regex_dbg.print_raw_bytecode(re);
466 regex_dbg.print_header();
467 regex_dbg.print_bytecode(re);
468 }
469
470 DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
471 EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
472 EXPECT_EQ(result.count, 32u);
473 if (result.count == 32u) {
474 EXPECT_EQ(result.matches.at(0).view.length(), 0u);
475 EXPECT_EQ(result.matches.at(10).view.length(), 1u);
476 EXPECT_EQ(result.matches.at(10).view, "a");
477 EXPECT_EQ(result.matches.at(31).view.length(), 0u);
478 }
479}
480
481TEST_CASE(simple_period_end_benchmark)
482{
483 Regex<PosixExtended> re("hello.$");
484 RegexResult m;
485 EXPECT_EQ(re.search("Hello1"sv, m), false);
486 EXPECT_EQ(re.search("hello1hello1"sv, m), true);
487 EXPECT_EQ(re.search("hello2hell"sv, m), false);
488 EXPECT_EQ(re.search("hello?"sv, m), true);
489}
490
491TEST_CASE(posix_extended_nested_capture_group)
492{
493 Regex<PosixExtended> re("(h(e(?<llo>llo)))"); // group 0 -> "hello", group 1 -> "ello", group 2/"llo" -> "llo"
494 auto result = re.match("hello"sv);
495 EXPECT(result.success);
496 EXPECT_EQ(result.capture_group_matches.size(), 1u);
497 EXPECT_EQ(result.capture_group_matches[0].size(), 3u);
498 EXPECT_EQ(result.capture_group_matches[0][0].view, "hello"sv);
499 EXPECT_EQ(result.capture_group_matches[0][1].view, "ello"sv);
500 EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv);
501}
502
503auto parse_test_case_long_disjunction_chain = DeprecatedString::repeated("a|"sv, 100000);
504
505TEST_CASE(ECMA262_parse)
506{
507 struct _test {
508 StringView pattern;
509 regex::Error expected_error { regex::Error::NoError };
510 regex::ECMAScriptFlags flags {};
511 };
512
513 _test const tests[] {
514 { "^hello.$"sv },
515 { "^(hello.)$"sv },
516 { "^h{0,1}ello.$"sv },
517 { "^hello\\W$"sv },
518 { "^hell\\w.$"sv },
519 { "^hell\\x6f1$"sv }, // ^hello1$
520 { "^hel(?:l\\w).$"sv },
521 { "^hel(?<LO>l\\w).$"sv },
522 { "^[-a-zA-Z\\w\\s]+$"sv },
523 { "\\bhello\\B"sv },
524 { "^[\\w+/_-]+[=]{0,2}$"sv }, // #4189
525 { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)"sv }, // #4189
526 { "\\/"sv }, // #4189
527 { ",/=-:"sv }, // #4243
528 { "\\x"sv }, // Even invalid escapes are allowed if ~unicode.
529 { "\\x1"sv }, // Even invalid escapes are allowed if ~unicode.
530 { "\\x1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
531 { "\\x11"sv },
532 { "\\x11"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
533 { "\\"sv, regex::Error::InvalidTrailingEscape },
534 { "(?"sv, regex::Error::InvalidCaptureGroup },
535 { "\\u1234"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
536 { "[\\u1234]"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
537 { "\\u1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
538 { "[\\u1]"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
539 { ",(?"sv, regex::Error::InvalidCaptureGroup }, // #4583
540 { "{1}"sv, regex::Error::InvalidPattern },
541 { "{1,2}"sv, regex::Error::InvalidPattern },
542 { "\\uxxxx"sv, regex::Error::NoError },
543 { "\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
544 { "\\ud83d"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
545 { "\\ud83d\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
546 { "\\u{0}"sv },
547 { "\\u{0}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
548 { "\\u{10ffff}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
549 { "\\u{10ffff"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
550 { "\\u{10ffffx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
551 { "\\u{110000}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
552 { "\\p"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
553 { "\\p{"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
554 { "\\p{}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
555 { "\\p{AsCiI}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
556 { "\\p{hello friends}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
557 { "\\p{Prepended_Concatenation_Mark}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
558 { "\\p{ASCII}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
559 { "\\\\p{1}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
560 { "\\\\p{AsCiI}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
561 { "\\\\p{ASCII}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
562 { "\\c"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
563 { "\\c"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
564 { "[\\c]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
565 { "[\\c]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
566 { "\\c`"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
567 { "\\c`"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
568 { "[\\c`]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
569 { "[\\c`]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
570 { "\\A"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
571 { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
572 { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
573 { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
574 { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
575 { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
576 { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
577 { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
578 { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
579 { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
580 { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
581 { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
582 { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
583 { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
584 { "]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
585 { "]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
586 { "\\]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
587 { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
588 { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
589 { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
590 { "a{9007199254740991}"sv }, // 2^53 - 1
591 { "a{9007199254740991,}"sv },
592 { "a{9007199254740991,9007199254740991}"sv },
593 { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent },
594 { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent },
595 { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent },
596 { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent },
597 { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
598 { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
599 { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
600 { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
601 { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
602 { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },
603 { "(?<$$_$$>a)"sv },
604 { "(?<ÿ>a)"sv },
605 { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
606 { "((?=lg)?[vl]k\\-?\\d{3}) bui| 3\\.[-\\w; ]{10}lg?-([06cv9]{3,4})"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // #12373, quantifiable assertions.
607 { parse_test_case_long_disjunction_chain.view() }, // A whole lot of disjunctions, should not overflow the stack.
608 { "(\"|')(?:(?!\\2)[^\\\\\\r\\n]|\\\\.)*\\2"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // LegacyOctalEscapeSequence should not consume too many chars (and should not crash)
609 };
610
611 for (auto& test : tests) {
612 Regex<ECMA262> re(test.pattern, test.flags);
613 EXPECT_EQ(re.parser_result.error, test.expected_error);
614 if constexpr (REGEX_DEBUG) {
615 dbgln("\n");
616 RegexDebug regex_dbg(stderr);
617 regex_dbg.print_raw_bytecode(re);
618 regex_dbg.print_header();
619 regex_dbg.print_bytecode(re);
620 dbgln("\n");
621 }
622 }
623}
624
625TEST_CASE(ECMA262_match)
626{
627 struct _test {
628 StringView pattern;
629 StringView subject;
630 bool matches { true };
631 ECMAScriptFlags options {};
632 };
633 // clang-format off
634 constexpr _test tests[] {
635 { "^hello.$"sv, "hello1"sv },
636 { "^(hello.)$"sv, "hello1"sv },
637 { "^h{0,1}ello.$"sv, "ello1"sv },
638 { "^hello\\W$"sv, "hello!"sv },
639 { "^hell\\w.$"sv, "hellx!"sv },
640 { "^hell\\x6f1$"sv, "hello1"sv },
641 { "^hel(?<LO>l.)1$"sv, "hello1"sv },
642 { "^hel(?<LO>l.)1*\\k<LO>.$"sv, "hello1lo1"sv },
643 { "^[-a-z1-3\\s]+$"sv, "hell2 o1"sv },
644 { "^[\\0-\\x1f]$"sv, "\n"sv },
645 { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
646 { "\\b.*\\b"sv, "hello1"sv },
647 { "[^\\D\\S]{2}"sv, "1 "sv },
648 { "bar(?=f.)foo"sv, "barfoo"sv },
649 { "bar(?=foo)bar"sv, "barbar"sv, false },
650 { "bar(?!foo)bar"sv, "barbar"sv, true },
651 { "bar(?!bar)bar"sv, "barbar"sv, false },
652 { "bar.*(?<=foo)"sv, "barbar"sv, false },
653 { "bar.*(?<!foo)"sv, "barbar"sv, true },
654 { "((...)X)+"sv, "fooXbarXbazX"sv, true },
655 { "(?:)"sv, ""sv, true },
656 { "\\^"sv, "^"sv },
657 { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
658 { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
659 { "(a{2}){3}"sv, "aaaaaa"sv },
660 { "(a{2}){3}"sv, "aaaabaa"sv, false },
661 { "(a{2}){4}"sv, "aaaaaaaa"sv },
662 { "(a{2}){4}"sv, "aaaaaabaa"sv, false },
663 { "(a{3}){2}"sv, "aaaaaa"sv },
664 { "(a{3}){2}"sv, "aaaabaa"sv, false },
665 { "(a{4}){2}"sv, "aaaaaaaa"sv },
666 { "(a{4}){2}"sv, "aaaaaabaa"sv, false },
667 { "\\u{4}"sv, "uuuu"sv },
668 { "(?<=.{3})f"sv, "abcdef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
669 { "(?<=.{3})f"sv, "abc😀ef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
670 // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers
671 { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended },
672 { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
673 { "\\05"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
674 { "\\455"sv, "\45""5"sv, true, ECMAScriptFlags::BrowserExtended },
675 { "\\314"sv, "\314"sv, true, ECMAScriptFlags::BrowserExtended },
676 { "\\c"sv, "\\c"sv, true, ECMAScriptFlags::BrowserExtended },
677 { "\\cf"sv, "\06"sv, true, ECMAScriptFlags::BrowserExtended },
678 { "\\c1"sv, "\\c1"sv, true, ECMAScriptFlags::BrowserExtended },
679 { "[\\c1]"sv, "\x11"sv, true, ECMAScriptFlags::BrowserExtended },
680 { "[\\w-\\d]"sv, "-"sv, true, ECMAScriptFlags::BrowserExtended },
681 { "^(?:^^\\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|,|-=|->|\\/|\\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\\^=|\\^\\^|\\^\\^=|{|\\||\\|=|\\|\\||\\|\\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*(\\/(?=[^*/])(?:[^/[\\\\]|\\\\[\\S\\s]|\\[(?:[^\\\\\\]]|\\\\[\\S\\s])*(?:]|$))+\\/)"sv,
682 "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
683 }, // #5517, appears to be matching JS expressions that involve regular expressions...
684 { "a{2,}"sv, "aaaa"sv }, // #5518
685 { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
686 { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
687 { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
688 { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
689 { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
690 { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
691 { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
692 { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
693 { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
694 { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag.
695 { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^].
696 { "^[_A-Z]+$"sv, "_aA"sv, true, ECMAScriptFlags::Insensitive }, // Insensitive lookup table: characters in a range do not necessarily lie in the same range after being converted to lowercase.
697 { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
698 { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive },
699 { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive },
700 { "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll.
701 };
702 // clang-format on
703
704 for (auto& test : tests) {
705 Regex<ECMA262> re(test.pattern, test.options);
706 if constexpr (REGEX_DEBUG) {
707 dbgln("\n");
708 RegexDebug regex_dbg(stderr);
709 regex_dbg.print_raw_bytecode(re);
710 regex_dbg.print_header();
711 regex_dbg.print_bytecode(re);
712 dbgln("\n");
713 }
714 EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
715 EXPECT_EQ(re.match(test.subject).success, test.matches);
716 }
717}
718
719TEST_CASE(ECMA262_unicode_match)
720{
721 constexpr auto space_and_line_terminator_code_points = Array { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF };
722
723 StringBuilder builder;
724 for (u32 code_point : space_and_line_terminator_code_points)
725 builder.append_code_point(code_point);
726 auto space_and_line_terminators = builder.to_deprecated_string();
727
728 struct _test {
729 StringView pattern;
730 StringView subject;
731 bool matches { true };
732 ECMAScriptFlags options {};
733 };
734 _test tests[] {
735 { "\xf0\x9d\x8c\x86"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
736 { "[\xf0\x9d\x8c\x86]"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
737 { "\\ud83d"sv, "😀"sv, true },
738 { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
739 { "\\ude00"sv, "😀"sv, true },
740 { "\\ude00"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
741 { "\\ud83d\\ude00"sv, "😀"sv, true },
742 { "\\ud83d\\ude00"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
743 { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
744 { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true },
745 { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode },
746 { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode },
747 { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode },
748 { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
749 { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
750 { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
751 { "^\\s+$"sv, space_and_line_terminators },
752 { "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode },
753 };
754
755 for (auto& test : tests) {
756 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
757
758 auto subject = MUST(AK::utf8_to_utf16(test.subject));
759 Utf16View view { subject };
760
761 if constexpr (REGEX_DEBUG) {
762 dbgln("\n");
763 RegexDebug regex_dbg(stderr);
764 regex_dbg.print_raw_bytecode(re);
765 regex_dbg.print_header();
766 regex_dbg.print_bytecode(re);
767 dbgln("\n");
768 }
769
770 EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
771 EXPECT_EQ(re.match(view).success, test.matches);
772 }
773}
774
775TEST_CASE(ECMA262_unicode_sets_match)
776{
777 struct _test {
778 StringView pattern;
779 StringView subject;
780 bool matches { true };
781 ECMAScriptFlags options {};
782 };
783
784 constexpr _test tests[] {
785 { "[\\w--x]"sv, "x"sv, false },
786 { "[\\w&&x]"sv, "y"sv, false },
787 { "[\\w--x]"sv, "y"sv, true },
788 { "[\\w&&x]"sv, "x"sv, true },
789 { "[[0-9\\w]--x--6]"sv, "6"sv, false },
790 { "[[0-9\\w]--x--6]"sv, "x"sv, false },
791 { "[[0-9\\w]--x--6]"sv, "y"sv, true },
792 { "[[0-9\\w]--x--6]"sv, "9"sv, true },
793 { "[\\w&&\\d]"sv, "a"sv, false },
794 { "[\\w&&\\d]"sv, "4"sv, true },
795 };
796
797 for (auto& test : tests) {
798 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::UnicodeSets | test.options);
799 if constexpr (REGEX_DEBUG) {
800 dbgln("\n");
801 RegexDebug regex_dbg(stderr);
802 regex_dbg.print_raw_bytecode(re);
803 regex_dbg.print_header();
804 regex_dbg.print_bytecode(re);
805 dbgln("\n");
806 }
807
808 EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
809 auto result = re.match(test.subject).success;
810 EXPECT_EQ(result, test.matches);
811 }
812}
813
814TEST_CASE(ECMA262_property_match)
815{
816 struct _test {
817 StringView pattern;
818 StringView subject;
819 bool matches { true };
820 ECMAScriptFlags options {};
821 };
822
823 constexpr _test tests[] {
824 { "\\p{ASCII}"sv, "a"sv, false },
825 { "\\p{ASCII}"sv, "p{ASCII}"sv, true },
826 { "\\p{ASCII}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
827 { "\\p{ASCII}"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
828 { "\\P{ASCII}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
829 { "\\P{ASCII}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
830 { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, ECMAScriptFlags::Unicode },
831 { "\\p{ASCII_Hex_Digit}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
832 { "\\p{ASCII_Hex_Digit}"sv, "x"sv, false, ECMAScriptFlags::Unicode },
833 { "\\P{ASCII_Hex_Digit}"sv, "1"sv, false, ECMAScriptFlags::Unicode },
834 { "\\P{ASCII_Hex_Digit}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
835 { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, ECMAScriptFlags::Unicode },
836 { "\\p{Any}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
837 { "\\P{Any}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
838 { "\\p{Assigned}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
839 { "\\P{Assigned}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
840 { "\\p{Lu}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
841 { "\\p{Lu}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
842 { "\\p{Lu}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
843 { "\\p{Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
844 { "\\p{Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
845 { "\\p{Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
846 { "\\P{Cased_Letter}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
847 { "\\P{Cased_Letter}"sv, "A"sv, false, ECMAScriptFlags::Unicode },
848 { "\\P{Cased_Letter}"sv, "9"sv, true, ECMAScriptFlags::Unicode },
849 { "\\p{General_Category=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
850 { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
851 { "\\p{General_Category=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
852 { "\\p{gc=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
853 { "\\p{gc=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
854 { "\\p{gc=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
855 { "\\p{Script=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
856 { "\\p{Script=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
857 { "\\p{Script=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
858 { "\\p{sc=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
859 { "\\p{sc=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
860 { "\\p{sc=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
861 { "\\p{Script_Extensions=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
862 { "\\p{Script_Extensions=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
863 { "\\p{Script_Extensions=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
864 { "\\p{scx=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
865 { "\\p{scx=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
866 { "\\p{scx=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
867 };
868
869 for (auto& test : tests) {
870 Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
871
872 auto subject = MUST(AK::utf8_to_utf16(test.subject));
873 Utf16View view { subject };
874
875 if constexpr (REGEX_DEBUG) {
876 dbgln("\n");
877 RegexDebug regex_dbg(stderr);
878 regex_dbg.print_raw_bytecode(re);
879 regex_dbg.print_header();
880 regex_dbg.print_bytecode(re);
881 dbgln("\n");
882 }
883
884 EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
885 EXPECT_EQ(re.match(view).success, test.matches);
886 }
887}
888
889TEST_CASE(replace)
890{
891 struct _test {
892 StringView pattern;
893 StringView replacement;
894 StringView subject;
895 StringView expected;
896 ECMAScriptFlags options {};
897 };
898
899 constexpr _test tests[] {
900 { "foo(.+)"sv, "aaa"sv, "test"sv, "test"sv },
901 { "foo(.+)"sv, "test\\1"sv, "foobar"sv, "testbar"sv },
902 { "foo(.+)"sv, "\\2\\1"sv, "foobar"sv, "\\2bar"sv },
903 { "foo(.+)"sv, "\\\\\\1"sv, "foobar"sv, "\\bar"sv },
904 { "foo(.)"sv, "a\\1"sv, "fooxfooy"sv, "axay"sv, ECMAScriptFlags::Multiline },
905 };
906
907 for (auto& test : tests) {
908 Regex<ECMA262> re(test.pattern, test.options);
909 if constexpr (REGEX_DEBUG) {
910 dbgln("\n");
911 RegexDebug regex_dbg(stderr);
912 regex_dbg.print_raw_bytecode(re);
913 regex_dbg.print_header();
914 regex_dbg.print_bytecode(re);
915 dbgln("\n");
916 }
917 EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
918 EXPECT_EQ(re.replace(test.subject, test.replacement), test.expected);
919 }
920}
921
922TEST_CASE(case_insensitive_match)
923{
924 Regex<PosixExtended> re("cd", PosixFlags::Insensitive | PosixFlags::Global);
925 auto result = re.match("AEKFCD"sv);
926
927 EXPECT_EQ(result.success, true);
928 if (result.success) {
929 EXPECT_EQ(result.matches.at(0).column, 4ul);
930 }
931}
932
933TEST_CASE(extremely_long_fork_chain)
934{
935 Regex<ECMA262> re("(?:aa)*");
936 auto result = re.match(DeprecatedString::repeated('a', 1000));
937 EXPECT_EQ(result.success, true);
938}
939
940TEST_CASE(theoretically_infinite_loop)
941{
942 Array patterns {
943 "(a*)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
944 "(a*?)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
945 "(a*)*?"sv, // Should match exactly nothing.
946 "(?:)*?"sv, // Should not generate an infinite fork loop.
947 };
948 for (auto& pattern : patterns) {
949 Regex<ECMA262> re(pattern);
950 auto result = re.match(""sv);
951 EXPECT_EQ(result.success, true);
952 }
953}
954
955static auto g_lots_of_a_s = DeprecatedString::repeated('a', 10'000'000);
956
957BENCHMARK_CASE(fork_performance)
958{
959 Regex<ECMA262> re("(?:aa)*");
960 auto result = re.match(g_lots_of_a_s);
961 EXPECT_EQ(result.success, true);
962}
963
964TEST_CASE(optimizer_atomic_groups)
965{
966 Array tests {
967 // Fork -> ForkReplace
968 Tuple { "a*b"sv, "aaaaa"sv, false },
969 Tuple { "a+b"sv, "aaaaa"sv, false },
970 Tuple { "\\\\(\\d+)"sv, "\\\\"sv, false }, // Rewrite bug turning a+ to a*, see #10952.
971 Tuple { "[a-z.]+\\."sv, "..."sv, true }, // Rewrite bug, incorrect interpretation of Compare.
972 Tuple { "[.-]+\\."sv, ".-."sv, true },
973 // Alternative fuse
974 Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true },
975 Tuple { "(a|a)"sv, "a"sv, true },
976 Tuple { "(a|)"sv, ""sv, true }, // Ensure that empty alternatives are not outright removed
977 Tuple { "a{2,3}|a{5,8}"sv, "abc"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
978 Tuple { "^(a{2,3}|a{5,8})$"sv, "aaaa"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
979 // Optimizer should not chop off *half* of an instruction when fusing instructions.
980 Tuple { "cubic-bezier\\(\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*\\)"sv, "cubic-bezier(.05, 0, 0, 1)"sv, true },
981 // ForkReplace shouldn't be applied where it would change the semantics
982 Tuple { "(1+)\\1"sv, "11"sv, true },
983 Tuple { "(1+)1"sv, "11"sv, true },
984 Tuple { "(1+)0"sv, "10"sv, true },
985 // Rewrite should not skip over first required iteration of <x>+.
986 Tuple { "a+"sv, ""sv, false },
987 // 'y' and [^x] have an overlap ('y'), the loop should not be rewritten here.
988 Tuple { "[^x]+y"sv, "ay"sv, true },
989 // .+ should not be rewritten here, as it's followed by something that would be matched by `.`.
990 Tuple { ".+(a|b|c)"sv, "xxa"sv, true },
991 };
992
993 for (auto& test : tests) {
994 Regex<ECMA262> re(test.get<0>());
995 auto result = re.match(test.get<1>());
996 EXPECT_EQ(result.success, test.get<2>());
997 }
998}
999
1000TEST_CASE(optimizer_char_class_lut)
1001{
1002 Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)");
1003
1004 if constexpr (REGEX_DEBUG) {
1005 dbgln("\n");
1006 RegexDebug regex_dbg(stderr);
1007 regex_dbg.print_raw_bytecode(re);
1008 regex_dbg.print_header();
1009 regex_dbg.print_bytecode(re);
1010 dbgln("\n");
1011 }
1012
1013 // This will go through _all_ alternatives in the character class, and then fail.
1014 for (size_t i = 0; i < 1'000'000; ++i)
1015 EXPECT_EQ(re.match("1635488940000"sv).success, false);
1016}
1017
1018TEST_CASE(optimizer_alternation)
1019{
1020 Array tests {
1021 // Pattern, Subject, Expected length
1022 Tuple { "a|"sv, "a"sv, 1u },
1023 };
1024
1025 for (auto& test : tests) {
1026 Regex<ECMA262> re(test.get<0>());
1027 auto result = re.match(test.get<1>());
1028 EXPECT(result.success);
1029 EXPECT_EQ(result.matches.first().view.length(), test.get<2>());
1030 }
1031}
1032
1033TEST_CASE(posix_basic_dollar_is_end_anchor)
1034{
1035 // Ensure that a dollar sign at the end only matches the end of the line.
1036 {
1037 Regex<PosixBasic> re("abc$");
1038 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
1039 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, true);
1040 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, false);
1041 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false);
1042 }
1043}
1044
1045TEST_CASE(posix_basic_dollar_is_literal)
1046{
1047 // Ensure that a dollar sign in the middle is treated as a literal.
1048 {
1049 Regex<PosixBasic> re("abc$d");
1050 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
1051 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false);
1052 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true);
1053 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false);
1054 }
1055
1056 // Ensure that a dollar sign is always treated as a literal if escaped, even if at the end of the pattern.
1057 {
1058 Regex<PosixBasic> re("abc\\$");
1059 EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
1060 EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false);
1061 EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true);
1062 EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, true);
1063 }
1064}
1065
1066TEST_CASE(negative_lookahead)
1067{
1068 {
1069 // Negative lookahead with more than 2 forks difference between lookahead init and finish.
1070 auto options = ECMAScriptOptions { ECMAScriptFlags::Global };
1071 options.reset_flag((ECMAScriptFlags)regex::AllFlags::Internal_Stateful);
1072 Regex<ECMA262> re(":(?!\\^\\)|1)", options);
1073 EXPECT_EQ(re.match(":^)"sv).success, false);
1074 EXPECT_EQ(re.match(":1"sv).success, false);
1075 EXPECT_EQ(re.match(":foobar"sv).success, true);
1076 }
1077 {
1078 // Correctly count forks with nested groups and optimised loops
1079 Regex<ECMA262> re("^((?:[^\\n]|\\n(?! *\\n))+)(?:\\n *)+\\n");
1080 EXPECT_EQ(re.match("foo\n\n"sv).success, true);
1081 EXPECT_EQ(re.match("foo\n"sv).success, false);
1082 }
1083}
1084
1085TEST_CASE(single_match_flag)
1086{
1087 {
1088 // Ensure that only a single match is produced and nothing past that.
1089 Regex<ECMA262> re("[\\u0008-\\uffff]"sv, ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
1090 auto result = re.match("ABC"sv);
1091 EXPECT_EQ(result.success, true);
1092 EXPECT_EQ(result.matches.size(), 1u);
1093 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "A"sv);
1094 }
1095}
1096
1097TEST_CASE(empty_string_wildcard_match)
1098{
1099 {
1100 // Ensure that the wildcard ".*" matches the empty string exactly once
1101 Regex<ECMA262> re(".*"sv, ECMAScriptFlags::Global);
1102 auto result = re.match(""sv);
1103 EXPECT_EQ(result.success, true);
1104 EXPECT_EQ(result.matches.size(), 1u);
1105 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), ""sv);
1106 }
1107}
1108
1109TEST_CASE(inversion_state_in_char_class)
1110{
1111 {
1112 // #13755, /[\S\s]/.exec("hello") should be [ "h" ], not null.
1113 Regex<ECMA262> re("[\\S\\s]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
1114
1115 auto result = re.match("hello"sv);
1116 EXPECT_EQ(result.success, true);
1117 EXPECT_EQ(result.matches.size(), 1u);
1118 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "h"sv);
1119 }
1120 {
1121 Regex<ECMA262> re("^(?:([^\\s!\"#%-,\\./;->@\\[-\\^`\\{-~]+(?=([=~}\\s/.)|]))))"sv, ECMAScriptFlags::Global);
1122
1123 auto result = re.match("slideNumbers}}"sv);
1124 EXPECT_EQ(result.success, true);
1125 EXPECT_EQ(result.matches.size(), 1u);
1126 EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "slideNumbers"sv);
1127 EXPECT_EQ(result.capture_group_matches.first()[0].view.to_deprecated_string(), "slideNumbers"sv);
1128 EXPECT_EQ(result.capture_group_matches.first()[1].view.to_deprecated_string(), "}"sv);
1129 }
1130}