@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
at upstream/main 567 lines 15 kB view raw
<?php

/**
 * Parses user-entered search strings into tokens, and renders tokens back
 * into a compiled query string.
 *
 * Parsing (see newTokens()) splits a raw query into terms. Each term carries
 * an operator, a "quoted" flag and, when functions are enabled, an optional
 * function name (the part before ":" in a term like "title:word").
 *
 * Compiling (see compileQuery() and its variants) renders each token as
 * prefix + open quote + value + close quote, using the characters configured
 * with setOperators(), and joins the distinct results with single spaces.
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  /**
   * Characters used when rendering compiled queries. Positions are
   * significant: offset 0 prefixes "and" terms, offset 2 prefixes "not"
   * terms, and offsets 10 and 11 are the opening and closing quote. The
   * other positions are not read by this class.
   */
  private $operators = '+ -><()~*:""&|';
  private $query;
  private $stemmer;
  private $enableFunctions = false;

  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';
  const OPERATOR_ABSENT = 'absent';
  const OPERATOR_PRESENT = 'present';

  /**
   * @param string $operators Operator characters. See the property
   *   documentation for which offsets are read.
   * @return $this
   */
  public function setOperators($operators) {
    $this->operators = $operators;
    return $this;
  }

  /**
   * @return string
   */
  public function getOperators() {
    return $this->operators;
  }

  /**
   * @param PhutilSearchStemmer $stemmer Stemmer applied to unquoted terms by
   *   compileStemmedQuery().
   * @return $this
   */
  public function setStemmer(PhutilSearchStemmer $stemmer) {
    $this->stemmer = $stemmer;
    return $this;
  }

  /**
   * @return PhutilSearchStemmer|null Null until a stemmer has been set.
   */
  public function getStemmer() {
    return $this->stemmer;
  }

  /**
   * @param bool $enable_functions True to recognize "function:" prefixes and
   *   the "~" and "=" operators while parsing.
   * @return $this
   */
  public function setEnableFunctions($enable_functions) {
    $this->enableFunctions = $enable_functions;
    return $this;
  }

  /**
   * @return bool
   */
  public function getEnableFunctions() {
    return $this->enableFunctions;
  }

  /**
   * Get maximum number of tables in a single JOIN. MariaDB and MySQL set this
   * to 61 tables per https://dev.mysql.com/doc/refman/8.4/en/join.html
   *
   * @return int
   */
  private function getMaxQueryTokens(): int {
    return 61;
  }

  /**
   * Compile every token.
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Null if there are no tokens to render.
   */
  public function compileQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $rendered = array();
    foreach ($tokens as $token) {
      $rendered[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($rendered);
  }

  /**
   * Compile only the quoted tokens, without stemming.
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Null if no token is quoted.
   */
  public function compileLiteralQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $rendered = array();
    foreach ($tokens as $token) {
      if ($token->isQuoted()) {
        $rendered[] = $this->renderToken($token);
      }
    }

    return $this->compileRenderedTokens($rendered);
  }

  /**
   * Compile only the unquoted tokens, passing each value through the
   * configured stemmer (if one has been set).
   *
   * @param array<PhutilSearchQueryToken> $tokens
   * @return string|null Null if every token is quoted.
   */
  public function compileStemmedQuery(array $tokens) {
    assert_instances_of($tokens, PhutilSearchQueryToken::class);

    $stemmer = $this->getStemmer();

    $rendered = array();
    foreach ($tokens as $token) {
      if (!$token->isQuoted()) {
        $rendered[] = $this->renderToken($token, $stemmer);
      }
    }

    return $this->compileRenderedTokens($rendered);
  }

  /**
   * Join rendered tokens with single spaces, dropping duplicates.
   *
   * @param array<string> $list Rendered tokens.
   * @return string|null Null if the list is empty.
   */
  private function compileRenderedTokens(array $list) {
    if (!$list) {
      return null;
    }

    return implode(' ', array_unique($list));
  }

  /**
   * Parse a raw query string into token objects.
   *
   * @param string|null $query Search string or part of the search string.
   * @return PhutilSearchQueryToken[]
   * @throws PhutilSearchQueryCompilerSyntaxException If the query is
   *   malformed. See tokenizeQuery().
   */
  public function newTokens($query) {
    $tokens = array();
    foreach ($this->tokenizeQuery($query) as $dictionary) {
      $tokens[] = PhutilSearchQueryToken::newFromDictionary($dictionary);
    }

    return $tokens;
  }

  /**
   * Turn a raw query string into a list of token dictionaries.
   *
   * The checks run in a fixed order: query length, lexing (which detects
   * unmatched quotes), token count, operator/value interpretation and,
   * when functions are enabled, absent-versus-present conflicts.
   *
   * @param string|null $query Search string or part of the search string.
   *   Null is treated as the empty string.
   * @return array<array<string, mixed>> A list of dictionaries like
   *   {"operator":"and","quoted":false,"value":"get","raw":"get",
   *   "function":null}. The "function" key is only present when functions
   *   are enabled.
   * @throws PhutilSearchQueryCompilerSyntaxException
   */
  private function tokenizeQuery($query) {
    if ($query === null) {
      $query = '';
    }

    $this->assertQueryLength($query);

    $raw_tokens = $this->lexQuery($query);
    $this->assertTokenCount($raw_tokens);

    $results = $this->interpretRawTokens($raw_tokens);

    if ($this->getEnableFunctions()) {
      $this->assertNoAbsentConflicts($results);
    }

    return $results;
  }

  /**
   * Reject queries longer than 1024 bytes.
   *
   * @param string $query
   * @return void
   * @throws PhutilSearchQueryCompilerSyntaxException
   */
  private function assertQueryLength($query) {
    $maximum_bytes = 1024;

    $query_bytes = strlen($query);
    if ($query_bytes <= $maximum_bytes) {
      return;
    }

    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query is too long (%s bytes, maximum is %s bytes). '.
        'Please use more specific search criteria.',
        new PhutilNumber($query_bytes),
        new PhutilNumber($maximum_bytes)));
  }

  /**
   * Reject queries with more tokens than getMaxQueryTokens() allows.
   *
   * @param array $raw_tokens Raw tokens from lexQuery().
   * @return void
   * @throws PhutilSearchQueryCompilerSyntaxException
   */
  private function assertTokenCount(array $raw_tokens) {
    $query_tokens = count($raw_tokens);
    $maximum_tokens = $this->getMaxQueryTokens();
    if ($query_tokens <= $maximum_tokens) {
      return;
    }

    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query has too many search tokens (%s tokens, maximum is %s '.
        'tokens). Please use more specific search criteria.',
        new PhutilNumber($query_tokens),
        new PhutilNumber($maximum_tokens)));
  }

  /**
   * Split a query string into raw tokens, one character at a time.
   *
   * The lexer is a small state machine. For each term it moves through the
   * modes "scan" (skip whitespace), "function" (optional "name:" prefix),
   * "operator" (leading operator characters), "quote" (optional opening
   * quote) and "token" (the term text). A single character may advance
   * through several modes within one iteration.
   *
   * @param string $query
   * @return array<array<string, mixed>> Raw tokens whose "operator" and
   *   "value" entries are lists of characters.
   * @throws PhutilSearchQueryCompilerSyntaxException If a double quote is
   *   never closed.
   */
  private function lexQuery($query) {
    $characters = phutil_utf8v($query);
    $length = count($characters);

    $enable_functions = $this->getEnableFunctions();

    // The "~" and "=" operators are only recognized alongside functions.
    if ($enable_functions) {
      $operator_pattern = '/^[~=+-]\z/';
    } else {
      $operator_pattern = '/^[+-]\z/';
    }

    $mode = 'scan';
    $current_operator = array();
    $current_token = array();
    $current_function = null;
    $is_quoted = false;
    $tokens = array();

    for ($ii = 0; $ii < $length; $ii++) {
      $character = $characters[$ii];
      $is_space = (bool)preg_match('/^\s\z/u', $character);
      $is_quote = ($character === '"');

      if ($mode === 'scan') {
        if ($is_space) {
          continue;
        }

        $mode = 'function';
      }

      if ($mode === 'function') {
        $mode = 'operator';

        if ($enable_functions) {
          // Look ahead over letters and hyphens. The run only names a
          // function if the first character after it is a colon.
          for ($jj = $ii; $jj < $length; $jj++) {
            if (!preg_match('/^[a-zA-Z-]\z/u', $characters[$jj])) {
              break;
            }
          }

          if ($jj < $length && $characters[$jj] === ':') {
            $name = implode('', array_slice($characters, $ii, ($jj - $ii)));

            // A bare ":" yields an empty name, which means "no function".
            $current_function = strlen($name) ? $name : null;

            // Resume after the colon.
            $ii = $jj;
            continue;
          }
        }
      }

      if ($mode === 'operator') {
        // Whitespace is skipped only before the first operator character.
        if (!$current_operator && $is_space) {
          continue;
        }

        if (preg_match($operator_pattern, $character)) {
          $current_operator[] = $character;
          continue;
        }

        $mode = 'quote';
      }

      if ($mode === 'quote') {
        $mode = 'token';

        if ($is_quote) {
          $is_quoted = true;
          continue;
        }
      }

      if ($mode === 'token') {
        $capture = false;
        $was_quoted = $is_quoted;

        if ($is_quoted) {
          // Inside quotes, only the closing quote ends the term.
          if ($is_quote) {
            $capture = true;
            $mode = 'scan';
            $is_quoted = false;
          }
        } else if ($is_space) {
          $capture = true;
          $mode = 'scan';
        } else if ($is_quote) {
          // A quote in the middle of an unquoted term ends that term and
          // immediately begins a new quoted term.
          $capture = true;
          $is_quoted = true;
        }

        if (!$capture) {
          $current_token[] = $character;
          continue;
        }

        $tokens[] = $this->newRawToken(
          $current_operator,
          $was_quoted,
          $current_token,
          $current_function);

        $current_operator = array();
        $current_token = array();
        $current_function = null;
      }
    }

    if ($is_quoted) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains unmatched double quotes.'));
    }

    // The loop only captures a term when it sees the terminating character,
    // so a term which runs to the end of the input is still pending here.
    // If the input has trailing space, like "a b ", nothing is pending.
    if ($current_function !== null || $current_operator || $current_token) {
      $tokens[] = $this->newRawToken(
        $current_operator,
        false,
        $current_token,
        $current_function);
    }

    return $tokens;
  }

  /**
   * Build one raw token dictionary for the lexer.
   *
   * @param array<string> $operator Operator characters.
   * @param bool $is_quoted Whether the term was enclosed in double quotes.
   * @param array<string> $value Term characters.
   * @param string|null $function Function name, if any.
   * @return array<string, mixed> The "function" key is only added when
   *   functions are enabled.
   */
  private function newRawToken(
    array $operator,
    $is_quoted,
    array $value,
    $function) {

    $token = array(
      'operator' => $operator,
      'quoted' => $is_quoted,
      'value' => $value,
    );

    if ($this->getEnableFunctions()) {
      $token['function'] = $function;
    }

    return $token;
  }

  /**
   * Convert raw lexer tokens into result dictionaries: resolve the operator,
   * validate empty terms, and apply "sticky" function names.
   *
   * @param array $raw_tokens Raw tokens from lexQuery().
   * @return array<array<string, mixed>>
   * @throws PhutilSearchQueryCompilerSyntaxException
   */
  private function interpretRawTokens(array $raw_tokens) {
    $enable_functions = $this->getEnableFunctions();

    $results = array();
    $last_function = null;
    foreach ($raw_tokens as $token) {
      $value = implode('', $token['value']);
      $operator_string = implode('', $token['operator']);
      $is_quoted = $token['quoted'];

      $operator = $this->selectOperator(
        $operator_string,
        $value,
        $is_quoted);

      $raw = $this->getDisplayToken($token);

      if (!strlen($value)) {
        // An empty term is only meaningful as an unquoted "function:-"
        // (field is absent) or "function:~" (field is present) test.
        $existence_operator = null;
        if ($enable_functions && ($token['function'] !== null)) {
          if ($operator === self::OPERATOR_NOT) {
            $existence_operator = self::OPERATOR_ABSENT;
          } else if ($operator === self::OPERATOR_SUBSTRING) {
            $existence_operator = self::OPERATOR_PRESENT;
          }
        }

        if ($existence_operator === null || $is_quoted) {
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query contains a token ("%s") with no search term. Query '.
              'tokens specify text to search for.',
              $raw));
        }

        $operator = $existence_operator;
        $value = null;
      }

      $result = array(
        'operator' => $operator,
        'quoted' => $is_quoted,
        'value' => $value,
        'raw' => $raw,
      );

      if ($enable_functions) {
        // If a user provides a query like "title:a b c", we interpret all
        // of the terms to be title terms: the "title:" function sticks
        // until we encounter another function.

        // If a user provides a query like "title:"a"" (with a quoted term),
        // the function is not sticky.

        if ($token['function'] !== null) {
          $function = $token['function'];
        } else {
          $function = $last_function;
        }

        $result['function'] = $function;

        // Note that the function remains sticky across quoted terms appearing
        // after the function term. For example, all of these terms are title
        // terms:
        //
        //   title:a "b c" d

        $is_sticky = (!$is_quoted || ($token['function'] === null));

        // Absent and present tests never carry their function forward.
        if ($operator === self::OPERATOR_ABSENT ||
            $operator === self::OPERATOR_PRESENT) {
          $is_sticky = false;
        }

        $last_function = $is_sticky ? $function : null;
      }

      $results[] = $result;
    }

    return $results;
  }

  /**
   * Map the operator characters typed before a term onto an OPERATOR_*
   * constant.
   *
   * @param string $operator_string Concatenated operator characters.
   * @param string $value Term text.
   * @param bool $is_quoted Whether the term was quoted.
   * @return string One of the OPERATOR_* constants.
   * @throws PhutilSearchQueryCompilerSyntaxException If more than one
   *   operator character was given.
   */
  private function selectOperator($operator_string, $value, $is_quoted) {
    switch ($operator_string) {
      case '-':
        return self::OPERATOR_NOT;
      case '~':
        return self::OPERATOR_SUBSTRING;
      case '=':
        return self::OPERATOR_EXACT;
      case '+':
        return self::OPERATOR_AND;
      case '':
        if ($this->getEnableFunctions() && !$is_quoted) {
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if (phutil_utf8_is_cjk($value)) {
            return self::OPERATOR_SUBSTRING;
          }

          // See T13632. Assume users searching for any term that begins
          // with an underscore intend to perform substring search if they
          // don't provide an explicit search function.
          if (phutil_preg_match('/^_/', $value)) {
            return self::OPERATOR_SUBSTRING;
          }
        }

        return self::OPERATOR_AND;
    }

    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query has an invalid sequence of operators ("%s").',
        $operator_string));
  }

  /**
   * If any function is required to be "absent", there must be no other
   * terms which make assertions about it.
   *
   * @param array $results Result dictionaries from interpretRawTokens().
   * @return void
   * @throws PhutilSearchQueryCompilerSyntaxException
   */
  private function assertNoAbsentConflicts(array $results) {
    $present_tokens = array();
    $absent_tokens = array();
    foreach ($results as $result) {
      if (!isset($result['function'])) {
        continue;
      }
      $function = $result['function'];

      if ($result['operator'] === self::OPERATOR_ABSENT) {
        $absent_tokens[$function][] = $result;
      } else {
        $present_tokens[$function][] = $result;
      }
    }

    foreach ($absent_tokens as $function => $function_tokens) {
      if (empty($present_tokens[$function])) {
        continue;
      }

      $absent_token = head($function_tokens);
      $present_token = head($present_tokens[$function]);

      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query field must be absent ("%s") and present ("%s"). This '.
          'is impossible, so the query is not valid.',
          $absent_token['raw'],
          $present_token['raw']));
    }
  }

  /**
   * Render one token as operator prefix + quoted value.
   *
   * @param PhutilSearchQueryToken $token
   * @param PhutilSearchStemmer|null $stemmer If given, the value is stemmed
   *   before it is quoted.
   * @return string
   * @throws PhutilSearchQueryCompilerSyntaxException If the token's operator
   *   has no prefix. See getOperatorPrefix().
   */
  private function renderToken(
    PhutilSearchQueryToken $token,
    ?PhutilSearchStemmer $stemmer = null) {
    $value = $token->getValue();

    if ($stemmer) {
      $value = $stemmer->stemToken($value);
    }

    // Quote first, then resolve the prefix, so an unsupported operator is
    // reported only after the value has been processed.
    $value = $this->quoteToken($value);
    $prefix = $this->getOperatorPrefix($token->getOperator());

    return $prefix.$value;
  }

  /**
   * Look up the prefix character for an operator.
   *
   * @param string $operator One of the OPERATOR_* constants.
   * @return string|null Null when the configured prefix is a space, meaning
   *   "no prefix".
   * @throws PhutilSearchQueryCompilerSyntaxException For any operator other
   *   than OPERATOR_AND and OPERATOR_NOT.
   */
  private function getOperatorPrefix($operator) {
    $operators = $this->operators;

    switch ($operator) {
      case self::OPERATOR_AND:
        $prefix = $operators[0];
        break;
      case self::OPERATOR_NOT:
        $prefix = $operators[2];
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Unsupported operator prefix "%s".',
            $operator));
    }

    if ($prefix === ' ') {
      return null;
    }

    return $prefix;
  }

  /**
   * Wrap a value in the configured opening and closing quote characters.
   * The value itself is not escaped.
   *
   * @param string $value
   * @return string
   */
  private function quoteToken($value) {
    $open_quote = $this->operators[10];
    $close_quote = $this->operators[11];

    return $open_quote.$value.$close_quote;
  }

  /**
   * Rebuild a human-readable form of a raw token for use in error messages
   * and in the "raw" key of results.
   *
   * @param array<string, mixed> $token Raw token from lexQuery().
   * @return string
   */
  private function getDisplayToken(array $token) {
    if (isset($token['function'])) {
      $function = $token['function'].':';
    } else {
      $function = '';
    }

    $operator_string = implode('', $token['operator']);

    $value = implode('', $token['value']);
    if ($token['quoted']) {
      $value = $this->quoteToken($value);
    }

    return sprintf('%s%s%s', $function, $operator_string, $value);
  }

}