@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator

When updating a Ferret search index document, reuse existing rows where possible

Summary:
Ref T13587. Currently, when a document is reindexed by Ferret, the old document is completely discarded and a new version is inserted to replace it.

This approach is simple to implement, but can lead to exhaustion of the ngram AUTO_INCREMENT id column in reasonable circumstances.

Conceptually, this approach "should" be fine and this exhaustion is an awkward implementation detail. However, since it's easy to be less wasteful when performing document updates and all the other approaches are awkward or leaky in other ways that are probably worse, use a more complex implementation to avoid executing unnecessary INSERT statements.

Test Plan:
- Created and indexed a new document, searched for it.
- Updated a document, indexed it with `bin/search index ... --force --trace`, saw only modifications updated in the index.
- Searched for newly added terms (got hits) and removed terms (no longer got hits) to verify add/delete index behavior.

Maniphest Tasks: T13587

Differential Revision: https://secure.phabricator.com/D21495

+259 -85
+259 -85
src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
··· 134 134 $ngram_engine = new PhabricatorSearchNgramEngine(); 135 135 $ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source); 136 136 137 + $conn = $object->establishConnection('w'); 138 + 139 + if ($ngrams) { 140 + $common = queryfx_all( 141 + $conn, 142 + 'SELECT ngram FROM %T WHERE ngram IN (%Ls)', 143 + $engine->getCommonNgramsTableName(), 144 + $ngrams); 145 + $common = ipull($common, 'ngram', 'ngram'); 146 + 147 + foreach ($ngrams as $key => $ngram) { 148 + if (isset($common[$ngram])) { 149 + unset($ngrams[$key]); 150 + continue; 151 + } 152 + 153 + // NOTE: MySQL discards trailing whitespace in CHAR(X) columns. 154 + $trimmed_ngram = rtrim($ngram, ' '); 155 + if (isset($common[$trimmed_ngram])) { 156 + unset($ngrams[$key]); 157 + continue; 158 + } 159 + } 160 + } 161 + 137 162 $object->openTransaction(); 138 163 139 164 try { 140 - $conn = $object->establishConnection('w'); 141 - $this->deleteOldDocument($engine, $object, $document); 165 + // See T13587. If this document already exists in the index, we try to 166 + // update the existing rows to avoid leaving the ngrams table heavily 167 + // fragmented. 
142 168 143 - queryfx( 169 + $old_document = queryfx_one( 144 170 $conn, 145 - 'INSERT INTO %T (objectPHID, isClosed, epochCreated, epochModified, 146 - authorPHID, ownerPHID) VALUES (%s, %d, %d, %d, %ns, %ns)', 171 + 'SELECT id FROM %T WHERE objectPHID = %s', 147 172 $engine->getDocumentTableName(), 148 - $object->getPHID(), 149 - $is_closed, 150 - $document->getDocumentCreated(), 151 - $document->getDocumentModified(), 152 - $author_phid, 153 - $owner_phid); 173 + $object->getPHID()); 174 + if ($old_document) { 175 + $old_document_id = (int)$old_document['id']; 176 + } else { 177 + $old_document_id = null; 178 + } 154 179 155 - $document_id = $conn->getInsertID(); 156 - foreach ($ferret_fields as $ferret_field) { 180 + if ($old_document_id === null) { 157 181 queryfx( 158 182 $conn, 159 - 'INSERT INTO %T (documentID, fieldKey, rawCorpus, termCorpus, 160 - normalCorpus) VALUES (%d, %s, %s, %s, %s)', 161 - $engine->getFieldTableName(), 162 - $document_id, 163 - $ferret_field['fieldKey'], 164 - $ferret_field['rawCorpus'], 165 - $ferret_field['termCorpus'], 166 - $ferret_field['normalCorpus']); 167 - } 183 + 'INSERT INTO %T (objectPHID, isClosed, epochCreated, epochModified, 184 + authorPHID, ownerPHID) VALUES (%s, %d, %d, %d, %ns, %ns)', 185 + $engine->getDocumentTableName(), 186 + $object->getPHID(), 187 + $is_closed, 188 + $document->getDocumentCreated(), 189 + $document->getDocumentModified(), 190 + $author_phid, 191 + $owner_phid); 192 + $document_id = $conn->getInsertID(); 168 193 169 - if ($ngrams) { 170 - $common = queryfx_all( 194 + $is_new = true; 195 + } else { 196 + $document_id = $old_document_id; 197 + queryfx( 171 198 $conn, 172 - 'SELECT ngram FROM %T WHERE ngram IN (%Ls)', 173 - $engine->getCommonNgramsTableName(), 174 - $ngrams); 175 - $common = ipull($common, 'ngram', 'ngram'); 176 - 177 - foreach ($ngrams as $key => $ngram) { 178 - if (isset($common[$ngram])) { 179 - unset($ngrams[$key]); 180 - continue; 181 - } 199 + 'UPDATE %T 200 + SET 201 + 
isClosed = %d, 202 + epochCreated = %d, 203 + epochModified = %d, 204 + authorPHID = %ns, 205 + ownerPHID = %ns 206 + WHERE id = %d', 207 + $engine->getDocumentTableName(), 208 + $is_closed, 209 + $document->getDocumentCreated(), 210 + $document->getDocumentModified(), 211 + $author_phid, 212 + $owner_phid, 213 + $document_id); 182 214 183 - // NOTE: MySQL discards trailing whitespace in CHAR(X) columns. 184 - $trim_ngram = rtrim($ngram, ' '); 185 - if (isset($common[$ngram])) { 186 - unset($ngrams[$key]); 187 - continue; 188 - } 189 - } 215 + $is_new = false; 190 216 } 191 217 192 - if ($ngrams) { 193 - $sql = array(); 194 - foreach ($ngrams as $ngram) { 195 - $sql[] = qsprintf( 196 - $conn, 197 - '(%d, %s)', 198 - $document_id, 199 - $ngram); 200 - } 218 + $this->updateStoredFields( 219 + $conn, 220 + $is_new, 221 + $document_id, 222 + $engine, 223 + $ferret_fields); 201 224 202 - foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) { 203 - queryfx( 204 - $conn, 205 - 'INSERT INTO %T (documentID, ngram) VALUES %LQ', 206 - $engine->getNgramsTableName(), 207 - $chunk); 208 - } 209 - } 225 + $this->updateStoredNgrams( 226 + $conn, 227 + $is_new, 228 + $document_id, 229 + $engine, 230 + $ngrams); 231 + 210 232 } catch (Exception $ex) { 211 233 $object->killTransaction(); 212 234 throw $ex; 235 + } catch (Throwable $ex) { 236 + $object->killTransaction(); 237 + throw $ex; 213 238 } 214 239 215 240 $object->saveTransaction(); 216 241 } 217 242 243 + private function updateStoredFields( 244 + AphrontDatabaseConnection $conn, 245 + $is_new, 246 + $document_id, 247 + PhabricatorFerretEngine $engine, 248 + $new_fields) { 218 249 219 - private function deleteOldDocument( 250 + if (!$is_new) { 251 + $old_fields = queryfx_all( 252 + $conn, 253 + 'SELECT * FROM %T WHERE documentID = %d', 254 + $engine->getFieldTableName(), 255 + $document_id); 256 + } else { 257 + $old_fields = array(); 258 + } 259 + 260 + $old_fields = ipull($old_fields, null, 'fieldKey'); 261 + $new_fields 
= ipull($new_fields, null, 'fieldKey'); 262 + 263 + $delete_rows = array(); 264 + $insert_rows = array(); 265 + $update_rows = array(); 266 + 267 + foreach ($old_fields as $field_key => $old_field) { 268 + if (!isset($new_fields[$field_key])) { 269 + $delete_rows[] = $old_field; 270 + } 271 + } 272 + 273 + $compare_keys = array( 274 + 'rawCorpus', 275 + 'termCorpus', 276 + 'normalCorpus', 277 + ); 278 + 279 + foreach ($new_fields as $field_key => $new_field) { 280 + if (!isset($old_fields[$field_key])) { 281 + $insert_rows[] = $new_field; 282 + continue; 283 + } 284 + 285 + $old_field = $old_fields[$field_key]; 286 + 287 + $same_row = true; 288 + foreach ($compare_keys as $compare_key) { 289 + if ($old_field[$compare_key] !== $new_field[$compare_key]) { 290 + $same_row = false; 291 + break; 292 + } 293 + } 294 + 295 + if ($same_row) { 296 + continue; 297 + } 298 + 299 + $new_field['id'] = $old_field['id']; 300 + $update_rows[] = $new_field; 301 + } 302 + 303 + if ($delete_rows) { 304 + queryfx( 305 + $conn, 306 + 'DELETE FROM %T WHERE id IN (%Ld)', 307 + $engine->getFieldTableName(), 308 + ipull($delete_rows, 'id')); 309 + } 310 + 311 + foreach ($update_rows as $update_row) { 312 + queryfx( 313 + $conn, 314 + 'UPDATE %T 315 + SET 316 + rawCorpus = %s, 317 + termCorpus = %s, 318 + normalCorpus = %s 319 + WHERE id = %d', 320 + $engine->getFieldTableName(), 321 + $update_row['rawCorpus'], 322 + $update_row['termCorpus'], 323 + $update_row['normalCorpus'], 324 + $update_row['id']); 325 + } 326 + 327 + foreach ($insert_rows as $insert_row) { 328 + queryfx( 329 + $conn, 330 + 'INSERT INTO %T (documentID, fieldKey, rawCorpus, termCorpus, 331 + normalCorpus) VALUES (%d, %s, %s, %s, %s)', 332 + $engine->getFieldTableName(), 333 + $document_id, 334 + $insert_row['fieldKey'], 335 + $insert_row['rawCorpus'], 336 + $insert_row['termCorpus'], 337 + $insert_row['normalCorpus']); 338 + } 339 + } 340 + 341 + private function updateStoredNgrams( 342 + AphrontDatabaseConnection 
$conn, 343 + $is_new, 344 + $document_id, 220 345 PhabricatorFerretEngine $engine, 221 - $object, 222 - PhabricatorSearchAbstractDocument $document) { 346 + $new_ngrams) { 223 347 224 - $conn = $object->establishConnection('w'); 348 + if ($is_new) { 349 + $old_ngrams = array(); 350 + } else { 351 + $old_ngrams = queryfx_all( 352 + $conn, 353 + 'SELECT id, ngram FROM %T WHERE documentID = %d', 354 + $engine->getNgramsTableName(), 355 + $document_id); 356 + } 225 357 226 - $old_document = queryfx_one( 227 - $conn, 228 - 'SELECT * FROM %T WHERE objectPHID = %s', 229 - $engine->getDocumentTableName(), 230 - $object->getPHID()); 231 - if (!$old_document) { 232 - return; 358 + $old_ngrams = ipull($old_ngrams, 'id', 'ngram'); 359 + $new_ngrams = array_fuse($new_ngrams); 360 + 361 + $delete_ids = array(); 362 + $insert_ngrams = array(); 363 + 364 + // NOTE: MySQL discards trailing whitespace in CHAR(X) columns. 365 + 366 + foreach ($old_ngrams as $ngram => $id) { 367 + if (isset($new_ngrams[$ngram])) { 368 + continue; 369 + } 370 + 371 + $untrimmed_ngram = $ngram.' 
'; 372 + if (isset($new_ngrams[$untrimmed_ngram])) { 373 + continue; 374 + } 375 + 376 + $delete_ids[] = $id; 233 377 } 234 378 235 - $old_id = $old_document['id']; 379 + foreach ($new_ngrams as $ngram) { 380 + if (isset($old_ngrams[$ngram])) { 381 + continue; 382 + } 236 383 237 - queryfx( 238 - $conn, 239 - 'DELETE FROM %T WHERE id = %d', 240 - $engine->getDocumentTableName(), 241 - $old_id); 384 + $trimmed_ngram = rtrim($ngram, ' '); 385 + if (isset($old_ngrams[$trimmed_ngram])) { 386 + continue; 387 + } 242 388 243 - queryfx( 244 - $conn, 245 - 'DELETE FROM %T WHERE documentID = %d', 246 - $engine->getFieldTableName(), 247 - $old_id); 389 + $insert_ngrams[] = $ngram; 390 + } 248 391 249 - queryfx( 250 - $conn, 251 - 'DELETE FROM %T WHERE documentID = %d', 252 - $engine->getNgramsTableName(), 253 - $old_id); 392 + if ($delete_ids) { 393 + $sql = array(); 394 + foreach ($delete_ids as $id) { 395 + $sql[] = qsprintf( 396 + $conn, 397 + '%d', 398 + $id); 399 + } 400 + 401 + foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) { 402 + queryfx( 403 + $conn, 404 + 'DELETE FROM %T WHERE id IN (%LQ)', 405 + $engine->getNgramsTableName(), 406 + $chunk); 407 + } 408 + } 409 + 410 + if ($insert_ngrams) { 411 + $sql = array(); 412 + foreach ($insert_ngrams as $ngram) { 413 + $sql[] = qsprintf( 414 + $conn, 415 + '(%d, %s)', 416 + $document_id, 417 + $ngram); 418 + } 419 + 420 + foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) { 421 + queryfx( 422 + $conn, 423 + 'INSERT INTO %T (documentID, ngram) VALUES %LQ', 424 + $engine->getNgramsTableName(), 425 + $chunk); 426 + } 427 + } 254 428 } 255 429 256 430 public function newFerretSearchFunctions() {