@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
at recaptime-dev/main 419 lines 14 kB view raw
1<?php 2 3/** 4 * Given a commit and a path, efficiently determine the most recent ancestor 5 * commit where the path was touched. 6 * 7 * In Git and Mercurial, log operations with a path are relatively slow. For 8 * example: 9 * 10 * git log -n1 <commit> -- <path> 11 * 12 * ...routinely takes several hundred milliseconds, and equivalent requests 13 * often take longer in Mercurial. 14 * 15 * Unfortunately, this operation is fundamental to rendering a repository for 16 * the web, and essentially everything else that's slow can be reduced to this 17 * plus some trivial work afterward. Making this fast is desirable and powerful, 18 * and allows us to make other things fast by expressing them in terms of this 19 * query. 20 * 21 * Because the query is fundamentally a graph query, it isn't easy to express 22 * in a reasonable way in MySQL, and we can't do round trips to the server to 23 * walk the graph without incurring huge performance penalties. 24 * 25 * However, the total amount of data in the graph is relatively small. By 26 * caching it in chunks and keeping it in APC, we can reasonably load and walk 27 * the graph in PHP quickly. 28 * 29 * For more context, see T2683. 30 * 31 * Structure of the Cache 32 * ====================== 33 * 34 * The cache divides commits into buckets (see @{method:getBucketSize}). To 35 * walk the graph, we pull a commit's bucket. The bucket is a map from commit 36 * IDs to a list of parents and changed paths, separated by `null`. For 37 * example, a bucket might look like this: 38 * 39 * array( 40 * 1 => array(0, null, 17, 18), 41 * 2 => array(1, null, 4), 42 * // ... 43 * ) 44 * 45 * This means that commit ID 1 has parent commit 0 (a special value meaning 46 * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1, 47 * and affected path 4. 48 * 49 * This data structure attempts to balance compactness, ease of construction, 50 * simplicity of cache semantics, and lookup performance. In the average case, 51 * it appears to do a reasonable job at this. 52 * 53 * @task query Querying the Graph Cache 54 * @task cache Cache Internals 55 */ 56final class PhabricatorRepositoryGraphCache extends Phobject { 57 58 private $rebuiltKeys = array(); 59 60 61/* -( Querying the Graph Cache )------------------------------------------- */ 62 63 64 /** 65 * Search the graph cache for the most recent modification to a path. 66 * 67 * @param int $commit_id The commit ID to search ancestors of. 68 * @param int $path_id The path ID to search for changes to. 69 * @param float $time Maximum number of seconds to spend trying to satisfy 70 * this query using the graph cache. By default `0.5` (500ms). 71 * @return mixed Commit ID, or `null` if no ancestors exist, or `false` if 72 * the graph cache was unable to determine the answer. 73 * @task query 74 */ 75 public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) { 76 $commit_id = (int)$commit_id; 77 $path_id = (int)$path_id; 78 79 $bucket_data = null; 80 $data_key = null; 81 $seen = array(); 82 83 $t_start = microtime(true); 84 $iterations = 0; 85 while (true) { 86 $bucket_key = $this->getBucketKey($commit_id); 87 88 if (($data_key != $bucket_key) || $bucket_data === null) { 89 $bucket_data = $this->getBucketData($bucket_key); 90 $data_key = $bucket_key; 91 } 92 93 if (empty($bucket_data[$commit_id])) { 94 // Rebuild the cache bucket, since the commit might be a very recent 95 // one that we'll pick up by rebuilding. 96 97 $bucket_data = $this->getBucketData($bucket_key, $bucket_data); 98 if (empty($bucket_data[$commit_id])) { 99 // A rebuild didn't help. This can occur legitimately if the commit 100 // is new and hasn't parsed yet. 101 return false; 102 } 103 104 // Otherwise, the rebuild gave us the data, so we can keep going. 105 106 $did_fill = true; 107 } else { 108 $did_fill = false; 109 } 110 111 // Sanity check so we can survive and recover from bad data. 112 if (isset($seen[$commit_id])) { 113 phlog(pht('Unexpected infinite loop in %s!', self::class)); 114 return false; 115 } else { 116 $seen[$commit_id] = true; 117 } 118 119 // `$data` is a list: the commit's parent IDs, followed by `null`, 120 // followed by the modified paths in ascending order. We figure out the 121 // first parent first, then check if the path was touched. If the path 122 // was touched, this is the commit we're after. If not, walk backward 123 // in the tree. 124 125 $items = $bucket_data[$commit_id]; 126 $size = count($items); 127 128 // Walk past the parent information. 129 $parent_id = null; 130 for ($ii = 0;; ++$ii) { 131 if ($items[$ii] === null) { 132 break; 133 } 134 if ($parent_id === null) { 135 $parent_id = $items[$ii]; 136 } 137 } 138 139 // Look for a modification to the path. 140 for (; $ii < $size; ++$ii) { 141 $item = $items[$ii]; 142 if ($item > $path_id) { 143 break; 144 } 145 if ($item === $path_id) { 146 return $commit_id; 147 } 148 } 149 150 if ($parent_id) { 151 $commit_id = $parent_id; 152 153 // Periodically check if we've spent too long looking for a result 154 // in the cache, and return so we can fall back to a VCS operation. 155 // This keeps us from having a degenerate worst case if, e.g., the 156 // cache is cold and we need to inspect a very large number of blocks 157 // to satisfy the query. 158 159 ++$iterations; 160 161 // If we performed a cache fill in this cycle, always check the time 162 // limit, since cache fills may take a significant amount of time. 163 164 if ($did_fill || ($iterations % 64 === 0)) { 165 $t_end = microtime(true); 166 if (($t_end - $t_start) > $time) { 167 return false; 168 } 169 } 170 continue; 171 } 172 173 // If we have an explicit 0, that means this commit really has no parents. 174 // Usually, it is the first commit in the repository. 175 if ($parent_id === 0) { 176 return null; 177 } 178 179 // If we didn't find a parent, the parent data isn't available. We fail 180 // to find an answer in the cache and fall back to querying the VCS. 181 return false; 182 } 183 } 184 185 186/* -( Cache Internals )---------------------------------------------------- */ 187 188 189 /** 190 * Get the bucket key for a given commit ID. 191 * 192 * @param int $commit_id Commit ID. 193 * @return int Bucket key. 194 * @task cache 195 */ 196 private function getBucketKey($commit_id) { 197 return (int)floor($commit_id / $this->getBucketSize()); 198 } 199 200 201 /** 202 * Get the cache key for a given bucket key (from @{method:getBucketKey}). 203 * 204 * @param int $bucket_key Bucket key. 205 * @return string Cache key. 206 * @task cache 207 */ 208 private function getBucketCacheKey($bucket_key) { 209 static $prefix; 210 211 if ($prefix === null) { 212 $self = get_class($this); 213 $size = $this->getBucketSize(); 214 $prefix = "{$self}:{$size}:2:"; 215 } 216 217 return $prefix.$bucket_key; 218 } 219 220 221 /** 222 * Get the number of items per bucket. 223 * 224 * @return int Number of items to store per bucket. 225 * @task cache 226 */ 227 private function getBucketSize() { 228 return 4096; 229 } 230 231 232 /** 233 * Retrieve or build a graph cache bucket from the cache. 234 * 235 * Normally, this operates as a readthrough cache call. It can also be used 236 * to force a cache update by passing the existing data to `$rebuild_data`. 237 * 238 * @param int $bucket_key Bucket key, from @{method:getBucketKey}. 239 * @param mixed $rebuild_data (optional) Current data, to force a cache 240 * rebuild of this bucket. 241 * @return array Data from the cache. 242 * @task cache 243 */ 244 private function getBucketData($bucket_key, $rebuild_data = null) { 245 $cache_key = $this->getBucketCacheKey($bucket_key); 246 247 // TODO: This cache stuff could be handled more gracefully, but the 248 // database cache currently requires values to be strings and needs 249 // some tweaking to support this as part of a stack. Our cache semantics 250 // here are also unusual (not purely readthrough) because this cache is 251 // appendable. 252 253 $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache(); 254 $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache(); 255 if ($rebuild_data === null) { 256 $bucket_data = $cache_level1->getKey($cache_key); 257 if ($bucket_data) { 258 return $bucket_data; 259 } 260 261 $bucket_data = $cache_level2->getKey($cache_key); 262 if ($bucket_data) { 263 $unserialized = @unserialize($bucket_data); 264 if ($unserialized) { 265 // Fill APC if we got a database hit but missed in APC. 266 $cache_level1->setKey($cache_key, $unserialized); 267 return $unserialized; 268 } 269 } 270 } 271 272 if (!is_array($rebuild_data)) { 273 $rebuild_data = array(); 274 } 275 276 $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data); 277 278 // Don't bother writing the data if we didn't update anything. 279 if ($bucket_data !== $rebuild_data) { 280 $cache_level2->setKey($cache_key, serialize($bucket_data)); 281 $cache_level1->setKey($cache_key, $bucket_data); 282 } 283 284 return $bucket_data; 285 } 286 287 288 /** 289 * Rebuild a cache bucket, amending existing data if available. 290 * 291 * @param int $bucket_key Bucket key, from @{method:getBucketKey}. 292 * @param array $current_data Existing bucket data. 293 * @return array Rebuilt bucket data. 294 * @task cache 295 */ 296 private function rebuildBucket($bucket_key, array $current_data) { 297 298 // First, check if we've already rebuilt this bucket. In some cases (like 299 // browsing a repository at some commit) it's common to issue many lookups 300 // against one commit. If that commit has been discovered but not yet 301 // fully imported, we'll repeatedly attempt to rebuild the bucket. If the 302 // first rebuild did not work, subsequent rebuilds are very unlikely to 303 // have any effect. We can just skip the rebuild in these cases. 304 305 if (isset($this->rebuiltKeys[$bucket_key])) { 306 return $current_data; 307 } else { 308 $this->rebuiltKeys[$bucket_key] = true; 309 } 310 311 $bucket_min = ($bucket_key * $this->getBucketSize()); 312 $bucket_max = ($bucket_min + $this->getBucketSize()) - 1; 313 314 // We need to reload all of the commits in the bucket because there is 315 // no guarantee that they'll get parsed in order, so we can fill large 316 // commit IDs before small ones. Later on, we'll ignore the commits we 317 // already know about. 318 319 $table_commit = new PhabricatorRepositoryCommit(); 320 $table_repository = new PhabricatorRepository(); 321 $conn_r = $table_commit->establishConnection('r'); 322 323 // Find all the Git and Mercurial commits in the block which have completed 324 // change import. We can't fill the cache accurately for commits which have 325 // not completed change import, so just pretend we don't know about them. 326 // In these cases, we will ultimately fall back to VCS queries. 327 328 $commit_rows = queryfx_all( 329 $conn_r, 330 'SELECT c.id FROM %T c 331 JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls) 332 WHERE c.id BETWEEN %d AND %d 333 AND (c.importStatus & %d) = %d', 334 $table_commit->getTableName(), 335 $table_repository->getTableName(), 336 array( 337 PhabricatorRepositoryType::REPOSITORY_TYPE_GIT, 338 PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL, 339 ), 340 $bucket_min, 341 $bucket_max, 342 PhabricatorRepositoryCommit::IMPORTED_CHANGE, 343 PhabricatorRepositoryCommit::IMPORTED_CHANGE); 344 345 // If we don't have any data, just return the existing data. 346 if (!$commit_rows) { 347 return $current_data; 348 } 349 350 // Remove the commits we already have data for. We don't need to rebuild 351 // these. If there's nothing left, return the existing data. 352 353 $commit_ids = ipull($commit_rows, 'id', 'id'); 354 $commit_ids = array_diff_key($commit_ids, $current_data); 355 356 if (!$commit_ids) { 357 return $current_data; 358 } 359 360 // Find all the path changes for the new commits. 361 $path_changes = queryfx_all( 362 $conn_r, 363 'SELECT commitID, pathID FROM %T 364 WHERE commitID IN (%Ld) 365 AND (isDirect = 1 OR changeType = %d)', 366 PhabricatorRepository::TABLE_PATHCHANGE, 367 $commit_ids, 368 DifferentialChangeType::TYPE_CHILD); 369 $path_changes = igroup($path_changes, 'commitID'); 370 371 // Find all the parents for the new commits. 372 $parents = queryfx_all( 373 $conn_r, 374 'SELECT childCommitID, parentCommitID FROM %T 375 WHERE childCommitID IN (%Ld) 376 ORDER BY id ASC', 377 PhabricatorRepository::TABLE_PARENTS, 378 $commit_ids); 379 $parents = igroup($parents, 'childCommitID'); 380 381 // Build the actual data for the cache. 382 foreach ($commit_ids as $commit_id) { 383 $parent_ids = array(); 384 if (!empty($parents[$commit_id])) { 385 foreach ($parents[$commit_id] as $row) { 386 $parent_ids[] = (int)$row['parentCommitID']; 387 } 388 } else { 389 // We expect all rows to have parents (commits with no parents get 390 // an explicit "0" placeholder). If we're in an older repository, the 391 // parent information might not have been populated yet. Decline to fill 392 // the cache if we don't have the parent information, since the fill 393 // will be incorrect. 394 continue; 395 } 396 397 if (isset($path_changes[$commit_id])) { 398 $path_ids = $path_changes[$commit_id]; 399 foreach ($path_ids as $key => $path_id) { 400 $path_ids[$key] = (int)$path_id['pathID']; 401 } 402 sort($path_ids); 403 } else { 404 $path_ids = array(); 405 } 406 407 $value = $parent_ids; 408 $value[] = null; 409 foreach ($path_ids as $path_id) { 410 $value[] = $path_id; 411 } 412 413 $current_data[$commit_id] = $value; 414 } 415 416 return $current_data; 417 } 418 419}