@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator

Combine the two different ngram-splitting algorithms into a single engine

Summary:
Ref T13501. Depends on D21127. With the "prefix" behavior removed in D21127, we now have two virtually identical copies of the same code.

The newer one in Ferret is better: it slices utf8 correctly and is slightly more efficient on large inputs. Pull it out and make all callers call into it.

Test Plan:
- Grepped for all affected symbols.
- Ran `bin/search index --force ...` to reindex various objects (tasks, files).
- Searched for things in the UI.

Maniphest Tasks: T13501

Differential Revision: https://secure.phabricator.com/D21128

+104 -102
+2
src/__phutil_library_map__.php
··· 4694 4694 'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php', 4695 4695 'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php', 4696 4696 'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php', 4697 + 'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php', 4697 4698 'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php', 4698 4699 'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php', 4699 4700 'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php', ··· 11417 11418 'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow', 11418 11419 'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow', 11419 11420 'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow', 11421 + 'PhabricatorSearchNgramEngine' => 'Phobject', 11420 11422 'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO', 11421 11423 'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 11422 11424 'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
+66
src/applications/search/engine/PhabricatorSearchNgramEngine.php
/**
 * Shared tokenizer and ngram extractor for search indexes.
 *
 * Splits text into whitespace-delimited tokens and derives the set of
 * three-character ngrams contained in those tokens. Both the indexing and
 * the query paths are meant to call into this class so that they always
 * agree on how a string is decomposed.
 */
final class PhabricatorSearchNgramEngine
  extends Phobject {

  /**
   * Split a string into whitespace-delimited tokens.
   *
   * Leading and trailing whitespace of any kind is ignored and never produces
   * empty tokens; an empty (or `null`) input yields an empty list.
   *
   * @param string|null $value Text to tokenize.
   * @return list<string> Non-empty tokens, in order of appearance.
   */
  public function tokenizeNgramString($value) {
    // Normalize `null` to the empty string rather than handing it to a
    // string function (deprecated on PHP 8.1+).
    $value = (string)$value;

    // PREG_SPLIT_NO_EMPTY discards the empty pieces that appear when the
    // input starts or ends with whitespace. Trimming only literal spaces is
    // not enough here, because the pattern also splits on tabs and newlines.
    $tokens = preg_split('/\s+/u', $value, -1, PREG_SPLIT_NO_EMPTY);

    if ($tokens === false) {
      // With the "u" modifier preg_split() fails outright when the subject
      // is not valid UTF-8. Fall back to a byte-oriented split instead of
      // returning `false` to callers which iterate over the result.
      $tokens = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);
    }

    if ($tokens === false) {
      return array();
    }

    return $tokens;
  }

  /**
   * Extract ngrams for matching whole terms.
   *
   * Each token is padded with one space on either side before ngrams are
   * taken, so the ngrams at the start and end of a token include that
   * boundary space.
   *
   * @param string $string Text to extract ngrams from.
   * @return list<string> Unique ngrams, sorted by `ksort()` default ordering.
   */
  public function getTermNgramsFromString($string) {
    return $this->getNgramsFromString($string, true);
  }

  /**
   * Extract ngrams for matching substrings.
   *
   * Tokens are used as-is, without boundary padding.
   *
   * @param string $string Text to extract ngrams from.
   * @return list<string> Unique ngrams, sorted by `ksort()` default ordering.
   */
  public function getSubstringNgramsFromString($string) {
    return $this->getNgramsFromString($string, false);
  }

  /**
   * Lowercase and tokenize a string, then collect every run of three
   * consecutive characters found in any token.
   *
   * Tokens shorter than three characters (after optional padding) contribute
   * no ngrams.
   *
   * @param string $value Text to extract ngrams from.
   * @param bool $as_term True to pad each token with a space on both sides.
   * @return list<string> Unique ngrams, always as strings.
   */
  private function getNgramsFromString($value, $as_term) {
    $value = phutil_utf8_strtolower($value);
    $tokens = $this->tokenizeNgramString($value);

    // First, extract unique tokens from the string. This reduces the number
    // of `phutil_utf8v()` calls we need to make if we are indexing a large
    // corpus with redundant terms.
    $unique_tokens = array();
    foreach ($tokens as $token) {
      if ($as_term) {
        $token = ' '.$token.' ';
      }

      $unique_tokens[$token] = true;
    }

    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      // PHP converts array keys which look like decimal integers ("2024")
      // into ints. Restore the string so the character and byte operations
      // below always act on a real string.
      $token = (string)$token;

      // NOTE(review): phutil_utf8v() is expected to return the token as a
      // list of individual UTF-8 characters; the byte arithmetic below
      // relies on that.
      $token_v = phutil_utf8v($token);
      $length = count($token_v);

      // NOTE: We're being somewhat clever here to micro-optimize performance,
      // especially for very long strings. See PHI87.

      // Byte length of each character, so ngrams can be cut out of the
      // original string with substr() without splitting a multibyte
      // character.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }

      $ngram_count = $length - 2;

      // Byte offset of the character at position $ii.
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;

        $cursor += $token_l[$ii];
      }
    }

    ksort($ngrams);

    // Return the values rather than the keys: the keys of digit-only ngrams
    // ("123") have been converted to ints by PHP, while the values are still
    // the original strings. The order is the same either way.
    return array_values($ngrams);
  }

}
+2 -1
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 131 131 } 132 132 $ngrams_source = implode("\n", $ngrams_source); 133 133 134 - $ngrams = $engine->getTermNgramsFromString($ngrams_source); 134 + $ngram_engine = new PhabricatorSearchNgramEngine(); 135 + $ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source); 135 136 136 137 $object->openTransaction(); 137 138
-60
src/applications/search/ferret/PhabricatorFerretEngine.php
··· 62 62 return new PhutilSearchStemmer(); 63 63 } 64 64 65 - public function tokenizeString($value) { 66 - $value = trim($value, ' '); 67 - $value = preg_split('/\s+/u', $value); 68 - return $value; 69 - } 70 - 71 - public function getTermNgramsFromString($string) { 72 - return $this->getNgramsFromString($string, true); 73 - } 74 - 75 - public function getSubstringNgramsFromString($string) { 76 - return $this->getNgramsFromString($string, false); 77 - } 78 - 79 - private function getNgramsFromString($value, $as_term) { 80 - $value = phutil_utf8_strtolower($value); 81 - $tokens = $this->tokenizeString($value); 82 - 83 - // First, extract unique tokens from the string. This reduces the number 84 - // of `phutil_utf8v()` calls we need to make if we are indexing a large 85 - // corpus with redundant terms. 86 - $unique_tokens = array(); 87 - foreach ($tokens as $token) { 88 - if ($as_term) { 89 - $token = ' '.$token.' '; 90 - } 91 - 92 - $unique_tokens[$token] = true; 93 - } 94 - 95 - $ngrams = array(); 96 - foreach ($unique_tokens as $token => $ignored) { 97 - $token_v = phutil_utf8v($token); 98 - $length = count($token_v); 99 - 100 - // NOTE: We're being somewhat clever here to micro-optimize performance, 101 - // especially for very long strings. See PHI87. 102 - 103 - $token_l = array(); 104 - for ($ii = 0; $ii < $length; $ii++) { 105 - $token_l[$ii] = strlen($token_v[$ii]); 106 - } 107 - 108 - $ngram_count = $length - 2; 109 - $cursor = 0; 110 - for ($ii = 0; $ii < $ngram_count; $ii++) { 111 - $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; 112 - 113 - $ngram = substr($token, $cursor, $ngram_l); 114 - $ngrams[$ngram] = $ngram; 115 - 116 - $cursor += $token_l[$ii]; 117 - } 118 - } 119 - 120 - ksort($ngrams); 121 - 122 - return array_keys($ngrams); 123 - } 124 - 125 65 public function newTermsCorpus($raw_corpus) { 126 66 $term_corpus = strtr( 127 67 $raw_corpus,
+2 -2
src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
··· 43 43 ), 44 44 ); 45 45 46 - $engine = new ManiphestTaskFerretEngine(); 46 + $ngram_engine = new PhabricatorSearchNgramEngine(); 47 47 48 48 foreach ($map as $input => $expect) { 49 - $actual = $engine->getTermNgramsFromString($input); 49 + $actual = $ngram_engine->getTermNgramsFromString($input); 50 50 $this->assertEqual( 51 51 $actual, 52 52 $expect,
+12 -34
src/applications/search/ngrams/PhabricatorSearchNgrams.php
··· 7 7 protected $ngram; 8 8 9 9 private $value; 10 + private $ngramEngine; 10 11 11 12 abstract public function getNgramKey(); 12 13 abstract public function getColumnName(); ··· 44 45 return "{$application}_{$key}_ngrams"; 45 46 } 46 47 47 - final public function tokenizeString($value) { 48 - $value = trim($value, ' '); 49 - $value = preg_split('/ +/', $value); 50 - return $value; 51 - } 52 - 53 - final public function getNgramsFromString($value, $mode) { 54 - $tokens = $this->tokenizeString($value); 55 - 56 - $ngrams = array(); 57 - foreach ($tokens as $token) { 58 - $token = phutil_utf8_strtolower($token); 59 - 60 - switch ($mode) { 61 - case 'query': 62 - break; 63 - case 'index': 64 - $token = ' '.$token.' '; 65 - break; 66 - } 67 - 68 - $len = (strlen($token) - 2); 69 - for ($ii = 0; $ii < $len; $ii++) { 70 - $ngram = substr($token, $ii, 3); 71 - $ngrams[$ngram] = $ngram; 72 - } 73 - } 48 + final public function writeNgram($object_id) { 49 + $ngram_engine = $this->getNgramEngine(); 50 + $ngrams = $ngram_engine->getTermNgramsFromString($this->getValue()); 74 51 75 - ksort($ngrams); 76 - 77 - return array_keys($ngrams); 78 - } 79 - 80 - final public function writeNgram($object_id) { 81 - $ngrams = $this->getNgramsFromString($this->getValue(), 'index'); 82 52 $conn_w = $this->establishConnection('w'); 83 53 84 54 $sql = array(); ··· 105 75 } 106 76 107 77 return $this; 78 + } 79 + 80 + private function getNgramEngine() { 81 + if (!$this->ngramEngine) { 82 + $this->ngramEngine = new PhabricatorSearchNgramEngine(); 83 + } 84 + 85 + return $this->ngramEngine; 108 86 } 109 87 110 88 }
+20 -5
src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
··· 36 36 private $ferretTables = array(); 37 37 private $ferretQuery; 38 38 private $ferretMetadata = array(); 39 + private $ngramEngine; 39 40 40 41 const FULLTEXT_RANK = '_ft_rank'; 41 42 const FULLTEXT_MODIFIED = '_ft_epochModified'; ··· 1984 1985 $stemmer = $engine->newStemmer(); 1985 1986 1986 1987 $ngram_table = $engine->getNgramsTableName(); 1988 + $ngram_engine = $this->getNgramEngine(); 1987 1989 1988 1990 $flat = array(); 1989 1991 foreach ($this->ferretTokens as $fulltext_token) { ··· 2032 2034 } 2033 2035 2034 2036 if ($is_substring) { 2035 - $ngrams = $engine->getSubstringNgramsFromString($value); 2037 + $ngrams = $ngram_engine->getSubstringNgramsFromString($value); 2036 2038 } else { 2037 2039 $terms_value = $engine->newTermsCorpus($value); 2038 - $ngrams = $engine->getTermNgramsFromString($terms_value); 2040 + $ngrams = $ngram_engine->getTermNgramsFromString($terms_value); 2039 2041 2040 2042 // If this is a stemmed term, only look for ngrams present in both the 2041 2043 // unstemmed and stemmed variations. ··· 2044 2046 // is (or, at least, may be) a normal word and activates. 
2045 2047 $terms_value = trim($terms_value, ' '); 2046 2048 $stem_value = $stemmer->stemToken($terms_value); 2047 - $stem_ngrams = $engine->getTermNgramsFromString($stem_value); 2049 + $stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value); 2048 2050 $ngrams = array_intersect($ngrams, $stem_ngrams); 2049 2051 } 2050 2052 } ··· 2409 2411 2410 2412 2411 2413 protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) { 2414 + $ngram_engine = $this->getNgramEngine(); 2415 + 2412 2416 $flat = array(); 2413 2417 foreach ($this->ngrams as $spec) { 2414 2418 $length = $spec['length']; ··· 2420 2424 $index = $spec['index']; 2421 2425 $value = $spec['value']; 2422 2426 2423 - $ngrams = $index->getNgramsFromString($value, 'query'); 2427 + $ngrams = $ngram_engine->getSubstringNgramsFromString($value); 2424 2428 2425 2429 foreach ($ngrams as $ngram) { 2426 2430 $flat[] = array( ··· 2476 2480 protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) { 2477 2481 $where = array(); 2478 2482 2483 + $ngram_engine = $this->getNgramEngine(); 2484 + 2479 2485 foreach ($this->ngrams as $ngram) { 2480 2486 $index = $ngram['index']; 2481 2487 $value = $ngram['value']; ··· 2488 2494 $column = qsprintf($conn, '%T', $column); 2489 2495 } 2490 2496 2491 - $tokens = $index->tokenizeString($value); 2497 + $tokens = $ngram_engine->tokenizeNgramString($value); 2498 + 2492 2499 foreach ($tokens as $token) { 2493 2500 $where[] = qsprintf( 2494 2501 $conn, ··· 2504 2511 2505 2512 protected function shouldGroupNgramResultRows() { 2506 2513 return (bool)$this->ngrams; 2514 + } 2515 + 2516 + private function getNgramEngine() { 2517 + if (!$this->ngramEngine) { 2518 + $this->ngramEngine = new PhabricatorSearchNgramEngine(); 2519 + } 2520 + 2521 + return $this->ngramEngine; 2507 2522 } 2508 2523 2509 2524