@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.)
hq.recaptime.dev/wiki/Phorge
phorge
phabricator
<?php

/**
 * Tokenizes user-entered search queries and compiles the resulting tokens
 * into a boolean search expression string.
 *
 * Typical use is to call @{method:newTokens} on a raw query string and then
 * pass the tokens to one of the `compile...Query()` methods, which render
 * each token as `<prefix><open quote><value><close quote>` and join the
 * rendered tokens with single spaces.
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  /**
   * Operator characters used when rendering compiled queries.
   *
   * Only some positions are read by this class: index 0 is the prefix for
   * required ("and") terms, index 2 is the prefix for excluded ("not") terms,
   * and indexes 10 and 11 are the opening and closing quote characters. A
   * space in a prefix position means "emit no prefix".
   *
   * The default is the same string as MySQL's default `ft_boolean_syntax`.
   */
  private $operators = '+ -><()~*:""&|';
  private $query;
  private $stemmer;
  private $enableFunctions = false;

  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';
  const OPERATOR_ABSENT = 'absent';
  const OPERATOR_PRESENT = 'present';

  /**
   * Replace the operator characters used when rendering compiled queries.
   *
   * The string is indexed by position (0, 2, 10 and 11 are read), so it must
   * be at least 12 characters long for tokens to render.
   *
   * @param string $operators Operator character string.
   * @return $this
   */
  public function setOperators($operators) {
    $this->operators = $operators;
    return $this;
  }

  /**
   * @return string Operator character string currently in use.
   */
  public function getOperators() {
    return $this->operators;
  }

  /**
   * @param PhutilSearchStemmer $stemmer Stemmer applied to token values by
   *   @{method:compileStemmedQuery}.
   * @return $this
   */
  public function setStemmer(PhutilSearchStemmer $stemmer) {
    $this->stemmer = $stemmer;
    return $this;
  }

  /**
   * @return PhutilSearchStemmer|null The configured stemmer, or null if
   *   none was set.
   */
  public function getStemmer() {
    return $this->stemmer;
  }

  /**
   * Enable or disable function syntax (`name:term`) and the additional `~`
   * and `=` operators while tokenizing.
   *
   * @param bool $enable_functions
   * @return $this
   */
  public function setEnableFunctions($enable_functions) {
    $this->enableFunctions = $enable_functions;
    return $this;
  }

  /**
   * @return bool
   */
  public function getEnableFunctions() {
    return $this->enableFunctions;
  }

  /**
   * Get maximum number of tables in a single JOIN. MariaDB and MySQL set this
   * to 61 tables per https://dev.mysql.com/doc/refman/8.4/en/join.html
   *
   * @return int
   */
  private function getMaxQueryTokens(): int {
    return 61;
  }

  /**
   * Compile every token, quoted or not, without stemming.
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Compiled query, or null if there are no tokens.
   */
  public function compileQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $result = array();
    foreach ($tokens as $token) {
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the quoted tokens, without stemming.
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Compiled query, or null if no token is quoted.
   */
  public function compileLiteralQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $result = array();
    foreach ($tokens as $token) {
      if (!$token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the unquoted tokens, passing each value through the
   * configured stemmer. If no stemmer has been set, values are rendered
   * unchanged.
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Compiled query, or null if every token is quoted.
   */
  public function compileStemmedQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $result = array();
    foreach ($tokens as $token) {
      if ($token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token, $this->getStemmer());
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Join rendered tokens into a single query string, dropping duplicates.
   *
   * @param array<string> $list Rendered tokens.
   * @return string|null Space-separated query, or null for an empty list.
   */
  private function compileRenderedTokens(array $list) {
    if (!$list) {
      return null;
    }

    $list = array_unique($list);
    return implode(' ', $list);
  }

  /**
   * Tokenize a raw query string into token objects.
   *
   * @param string|null $query Raw search string.
   * @return PhutilSearchQueryToken[]
   */
  public function newTokens($query) {
    $results = $this->tokenizeQuery($query);

    $tokens = array();
    foreach ($results as $result) {
      $tokens[] = PhutilSearchQueryToken::newFromDictionary($result);
    }

    return $tokens;
  }

  /**
   * Tokenize a raw query string into token dictionaries.
   *
   * Validation happens in this order: byte length of the query, balanced
   * double quotes, number of tokens, per-token operator and value checks,
   * and finally (with functions enabled) absent/present conflicts.
   *
   * @param string|null $query Search string or part of the search string
   * @return array<array<string, mixed>> An array consisting of array elements
   *   like {"operator":"and","quoted":false,"value":"get","raw":"get",
   *   "function":null}. The "function" key is only present when functions
   *   are enabled.
   */
  private function tokenizeQuery($query) {
    $maximum_bytes = 1024;
    if ($query === null) {
      $query = '';
    }
    $query_bytes = strlen($query);
    if ($query_bytes > $maximum_bytes) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query is too long (%s bytes, maximum is %s bytes). '.
          'Please use more specific search criteria.',
          new PhutilNumber($query_bytes),
          new PhutilNumber($maximum_bytes)));
    }

    $enable_functions = $this->getEnableFunctions();

    $tokens = $this->lexQuery(phutil_utf8v($query), $enable_functions);

    $query_tokens = count($tokens);
    $maximum_tokens = $this->getMaxQueryTokens();
    if ($query_tokens > $maximum_tokens) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query has too many search tokens (%s tokens, maximum is %s '.
          'tokens). Please use more specific search criteria.',
          new PhutilNumber($query_tokens),
          new PhutilNumber($maximum_tokens)));
    }

    $results = $this->resolveTokens($tokens, $enable_functions);

    if ($enable_functions) {
      $this->assertNoFieldConflicts($results);
    }

    return $results;
  }

  /**
   * Split a query, given as a list of characters, into raw tokens.
   *
   * This is a small state machine. For each token it moves through these
   * modes in order, falling through to the next mode on the same character
   * unless the character was consumed:
   *
   *   - `scan`: skip whitespace between tokens.
   *   - `function`: when functions are enabled, look ahead for a run of
   *     `[a-zA-Z-]` characters terminated by `:` and consume it as the
   *     function name.
   *   - `operator`: collect operator characters.
   *   - `quote`: consume an opening double quote, if present.
   *   - `token`: accumulate characters until whitespace or a double quote
   *     (unquoted) or the closing double quote (quoted).
   *
   * @param array<string> $characters Query split into characters.
   * @param bool $enable_functions Whether function syntax is recognized.
   * @return array<array<string, mixed>> Raw tokens with "operator" and
   *   "value" as character lists, "quoted" as a bool and, when functions
   *   are enabled, "function" as a string or null.
   */
  private function lexQuery(array $characters, $enable_functions): array {
    $length = count($characters);

    $mode = 'scan';
    $current_operator = array();
    $current_token = array();
    $current_function = null;
    $is_quoted = false;
    $tokens = array();

    if ($enable_functions) {
      $operator_characters = '[~=+-]';
    } else {
      $operator_characters = '[+-]';
    }

    for ($ii = 0; $ii < $length; $ii++) {
      $character = $characters[$ii];

      if ($mode === 'scan') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        $mode = 'function';
      }

      if ($mode === 'function') {
        $mode = 'operator';

        if ($enable_functions) {
          // Look ahead for "name:". The scan stops at the first character
          // which is not a letter or hyphen; the prefix is only a function
          // if that character is a colon.
          $found = false;
          for ($jj = $ii; $jj < $length; $jj++) {
            if (preg_match('/^[a-zA-Z-]\z/u', $characters[$jj])) {
              continue;
            }
            if ($characters[$jj] === ':') {
              $found = $jj;
            }
            break;
          }

          if ($found !== false) {
            $function = array_slice($characters, $ii, ($jj - $ii));
            $current_function = implode('', $function);

            // A bare ":" has an empty name, which is treated as no function.
            if ($current_function === '') {
              $current_function = null;
            }

            // Resume after the colon.
            $ii = $jj;
            continue;
          }
        }
      }

      if ($mode === 'operator') {
        // Whitespace is tolerated before the first operator character, for
        // example after a "name:" prefix.
        if (!$current_operator) {
          if (preg_match('/^\s\z/u', $character)) {
            continue;
          }
        }

        if (preg_match('/^'.$operator_characters.'\z/', $character)) {
          $current_operator[] = $character;
          continue;
        }

        $mode = 'quote';
      }

      if ($mode === 'quote') {
        if ($character === '"') {
          $is_quoted = true;
          $mode = 'token';
          continue;
        }

        $mode = 'token';
      }

      if ($mode === 'token') {
        $capture = false;
        $was_quoted = $is_quoted;
        if ($is_quoted) {
          if ($character === '"') {
            $capture = true;
            $mode = 'scan';
            $is_quoted = false;
          }
        } else {
          if (preg_match('/^\s\z/u', $character)) {
            $capture = true;
            $mode = 'scan';
          }

          // A quote in the middle of an unquoted token ends that token and
          // immediately begins a new quoted token.
          if ($character === '"') {
            $capture = true;
            $mode = 'token';
            $is_quoted = true;
          }
        }

        if ($capture) {
          $tokens[] = $this->newRawToken(
            $current_operator,
            $current_token,
            $was_quoted,
            $current_function,
            $enable_functions);

          $current_operator = array();
          $current_token = array();
          $current_function = null;
          continue;
        } else {
          $current_token[] = $character;
        }
      }
    }

    if ($is_quoted) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains unmatched double quotes.'));
    }

    // If the input query has trailing space, like "a b ", we may exit the
    // parser without a final token.
    if ($current_function !== null || $current_operator || $current_token) {
      $tokens[] = $this->newRawToken(
        $current_operator,
        $current_token,
        false,
        $current_function,
        $enable_functions);
    }

    return $tokens;
  }

  /**
   * Build a raw token dictionary as produced by the lexer.
   *
   * @param array<string> $operator Operator characters.
   * @param array<string> $value Value characters.
   * @param bool $is_quoted Whether the value was double-quoted.
   * @param string|null $function Function name, if any.
   * @param bool $enable_functions Whether to include the "function" key.
   * @return array<string, mixed>
   */
  private function newRawToken(
    array $operator,
    array $value,
    $is_quoted,
    $function,
    $enable_functions): array {

    $token = array(
      'operator' => $operator,
      'quoted' => $is_quoted,
      'value' => $value,
    );

    if ($enable_functions) {
      $token['function'] = $function;
    }

    return $token;
  }

  /**
   * Convert raw lexer tokens into result dictionaries: map operator
   * characters to `OPERATOR_*` constants, validate values, and (with
   * functions enabled) apply "sticky" function names to following tokens.
   *
   * @param array<array<string, mixed>> $tokens Raw tokens from the lexer.
   * @param bool $enable_functions Whether function syntax is recognized.
   * @return array<array<string, mixed>> Result dictionaries with "operator",
   *   "quoted", "value", "raw" and, when functions are enabled, "function".
   */
  private function resolveTokens(array $tokens, $enable_functions): array {
    $results = array();
    $last_function = null;
    foreach ($tokens as $token) {
      $value = implode('', $token['value']);
      $operator_string = implode('', $token['operator']);
      $is_quoted = $token['quoted'];

      switch ($operator_string) {
        case '-':
          $operator = self::OPERATOR_NOT;
          break;
        case '~':
          $operator = self::OPERATOR_SUBSTRING;
          break;
        case '=':
          $operator = self::OPERATOR_EXACT;
          break;
        case '+':
          $operator = self::OPERATOR_AND;
          break;
        case '':
          $use_substring = false;

          if ($enable_functions && !$is_quoted) {
            // See T12995. If this query term contains Chinese, Japanese or
            // Korean characters, treat the term as a substring term by
            // default. These languages do not separate words with spaces, so
            // the term search mode is normally useless.
            if (phutil_utf8_is_cjk($value)) {
              $use_substring = true;
            } else if (phutil_preg_match('/^_/', $value)) {
              // See T13632. Assume users searching for any term that begins
              // with an underscore intend to perform substring search if they
              // don't provide an explicit search function.
              $use_substring = true;
            }
          }

          if ($use_substring) {
            $operator = self::OPERATOR_SUBSTRING;
          } else {
            $operator = self::OPERATOR_AND;
          }
          break;
        default:
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query has an invalid sequence of operators ("%s").',
              $operator_string));
      }

      if ($value === '') {
        $require_value = $is_quoted;

        // With functions enabled, "name:-" means the field must be absent
        // and "name:~" means the field must be present. Every other empty
        // value is an error.
        switch ($operator) {
          case self::OPERATOR_NOT:
            if ($enable_functions && ($token['function'] !== null)) {
              $operator = self::OPERATOR_ABSENT;
              $value = null;
            } else {
              $require_value = true;
            }
            break;
          case self::OPERATOR_SUBSTRING:
            if ($enable_functions && ($token['function'] !== null)) {
              $operator = self::OPERATOR_PRESENT;
              $value = null;
            } else {
              $require_value = true;
            }
            break;
          default:
            $require_value = true;
            break;
        }

        if ($require_value) {
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query contains a token ("%s") with no search term. Query '.
              'tokens specify text to search for.',
              $this->getDisplayToken($token)));
        }
      }

      $result = array(
        'operator' => $operator,
        'quoted' => $is_quoted,
        'value' => $value,
        'raw' => $this->getDisplayToken($token),
      );

      if ($enable_functions) {
        // If a user provides a query like "title:a b c", we interpret all
        // of the terms to be title terms: the "title:" function sticks
        // until we encounter another function.

        // If a user provides a query like "title:"a"" (with a quoted term),
        // the function is not sticky.

        if ($token['function'] !== null) {
          $function = $token['function'];
        } else {
          $function = $last_function;
        }

        $result['function'] = $function;

        // Note that the function remains sticky across quoted terms appearing
        // after the function term. For example, all of these terms are title
        // terms:
        //
        //   title:a "b c" d

        $is_sticky = (!$result['quoted'] || ($token['function'] === null));

        switch ($operator) {
          case self::OPERATOR_ABSENT:
          case self::OPERATOR_PRESENT:
            $is_sticky = false;
            break;
        }

        if ($is_sticky) {
          $last_function = $function;
        } else {
          $last_function = null;
        }
      }

      $results[] = $result;
    }

    return $results;
  }

  /**
   * Reject queries which require a function's field to be absent while
   * another term makes an assertion about the same function.
   *
   * @param array<array<string, mixed>> $results Resolved token dictionaries.
   * @return void
   */
  private function assertNoFieldConflicts(array $results) {
    // If any function is required to be "absent", there must be no other
    // terms which make assertions about it.

    $present_tokens = array();
    $absent_tokens = array();
    foreach ($results as $result) {
      if (!isset($result['function'])) {
        continue;
      }
      $function = $result['function'];

      if ($result['operator'] === self::OPERATOR_ABSENT) {
        $absent_tokens[$function][] = $result;
      } else {
        $present_tokens[$function][] = $result;
      }
    }

    foreach ($absent_tokens as $function => $absent_list) {
      $absent_token = head($absent_list);

      if (empty($present_tokens[$function])) {
        continue;
      }

      $present_token = head($present_tokens[$function]);

      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query field must be absent ("%s") and present ("%s"). This '.
          'is impossible, so the query is not valid.',
          $absent_token['raw'],
          $present_token['raw']));
    }
  }

  /**
   * Render one token as `<prefix><open quote><value><close quote>`.
   *
   * @param PhutilSearchQueryToken $token Token to render.
   * @param PhutilSearchStemmer|null $stemmer If given, the token value is
   *   stemmed before it is quoted.
   * @return string
   */
  private function renderToken(
    PhutilSearchQueryToken $token,
    ?PhutilSearchStemmer $stemmer = null) {
    $value = $token->getValue();

    if ($stemmer) {
      $value = $stemmer->stemToken($value);
    }

    $value = $this->quoteToken($value);
    $operator = $token->getOperator();
    $prefix = $this->getOperatorPrefix($operator);

    $value = $prefix.$value;

    return $value;
  }

  /**
   * Look up the prefix character for an operator in the operator string.
   *
   * Only the "and" and "not" operators can be rendered; any other operator
   * raises an exception.
   *
   * @param string $operator One of the `OPERATOR_*` constants.
   * @return string|null Prefix character, or null if the configured prefix
   *   is a space.
   */
  private function getOperatorPrefix($operator) {
    $operators = $this->operators;

    switch ($operator) {
      case self::OPERATOR_AND:
        $prefix = $operators[0];
        break;
      case self::OPERATOR_NOT:
        $prefix = $operators[2];
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Unsupported operator prefix "%s".',
            $operator));
    }

    if ($prefix === ' ') {
      $prefix = null;
    }

    return $prefix;
  }

  /**
   * Wrap a value in the configured opening and closing quote characters
   * (positions 10 and 11 of the operator string).
   *
   * @param string $value Value to quote.
   * @return string
   */
  private function quoteToken($value) {
    $operators = $this->operators;

    $open_quote = $operators[10];
    $close_quote = $operators[11];

    return $open_quote.$value.$close_quote;
  }

  /**
   * Rebuild a human-readable form of a raw lexer token, for use in the
   * "raw" field and in error messages.
   *
   * @param array<string, mixed> $token Raw token from the lexer.
   * @return string Function prefix (if any), operator characters, and the
   *   value, quoted if the token was quoted.
   */
  private function getDisplayToken(array $token) {
    if (isset($token['function'])) {
      $function = $token['function'].':';
    } else {
      $function = '';
    }

    $operator_string = implode('', $token['operator']);

    $value = implode('', $token['value']);

    $is_quoted = $token['quoted'];
    if ($is_quoted) {
      $value = $this->quoteToken($value);
    }

    return sprintf('%s%s%s', $function, $operator_string, $value);
  }

}