src/applications/repository/graphcache/PhabricatorRepositoryGraphCache.php at recaptime-dev/main

@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
fork atom
phorge / src / applications / repository / graphcache / PhabricatorRepositoryGraphCache.php
at recaptime-dev/main 419 lines 14 kB view raw
wrap content
Andre Klapper Misc PhpDoc additions or improvements 5mo ago
9d087e8b
  1<?php
  2
  3/**
  4 * Given a commit and a path, efficiently determine the most recent ancestor
  5 * commit where the path was touched.
  6 *
  7 * In Git and Mercurial, log operations with a path are relatively slow. For
  8 * example:
  9 *
 10 *    git log -n1 <commit> -- <path>
 11 *
 12 * ...routinely takes several hundred milliseconds, and equivalent requests
 13 * often take longer in Mercurial.
 14 *
 15 * Unfortunately, this operation is fundamental to rendering a repository for
 16 * the web, and essentially everything else that's slow can be reduced to this
 17 * plus some trivial work afterward. Making this fast is desirable and powerful,
 18 * and allows us to make other things fast by expressing them in terms of this
 19 * query.
 20 *
 21 * Because the query is fundamentally a graph query, it isn't easy to express
 22 * in a reasonable way in MySQL, and we can't do round trips to the server to
 23 * walk the graph without incurring huge performance penalties.
 24 *
 25 * However, the total amount of data in the graph is relatively small. By
 26 * caching it in chunks and keeping it in APC, we can reasonably load and walk
 27 * the graph in PHP quickly.
 28 *
 29 * For more context, see T2683.
 30 *
 31 * Structure of the Cache
 32 * ======================
 33 *
 34 * The cache divides commits into buckets (see @{method:getBucketSize}). To
 35 * walk the graph, we pull a commit's bucket. The bucket is a map from commit
 36 * IDs to a list of parents and changed paths, separated by `null`. For
 37 * example, a bucket might look like this:
 38 *
 39 *   array(
 40 *     1 => array(0, null, 17, 18),
 41 *     2 => array(1, null, 4),
 42 *     // ...
 43 *   )
 44 *
 45 * This means that commit ID 1 has parent commit 0 (a special value meaning
 46 * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1,
 47 * and affected path 4.
 48 *
 49 * This data structure attempts to balance compactness, ease of construction,
 50 * simplicity of cache semantics, and lookup performance. In the average case,
 51 * it appears to do a reasonable job at this.
 52 *
 53 * @task query Querying the Graph Cache
 54 * @task cache Cache Internals
 55 */
 56final class PhabricatorRepositoryGraphCache extends Phobject {
 57
 58  private $rebuiltKeys = array();
 59
 60
 61/* -(  Querying the Graph Cache  )------------------------------------------- */
 62
 63
 64  /**
 65   * Search the graph cache for the most recent modification to a path.
 66   *
 67   * @param int     $commit_id The commit ID to search ancestors of.
 68   * @param int     $path_id The path ID to search for changes to.
 69   * @param float   $time Maximum number of seconds to spend trying to satisfy
 70   *                this query using the graph cache. By default `0.5` (500ms).
 71   * @return mixed  Commit ID, or `null` if no ancestors exist, or `false` if
 72   *                the graph cache was unable to determine the answer.
 73   * @task query
 74   */
 75  public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) {
 76    $commit_id = (int)$commit_id;
 77    $path_id = (int)$path_id;
 78
 79    $bucket_data = null;
 80    $data_key = null;
 81    $seen = array();
 82
 83    $t_start = microtime(true);
 84    $iterations = 0;
 85    while (true) {
 86      $bucket_key = $this->getBucketKey($commit_id);
 87
 88      if (($data_key != $bucket_key) || $bucket_data === null) {
 89        $bucket_data = $this->getBucketData($bucket_key);
 90        $data_key = $bucket_key;
 91      }
 92
 93      if (empty($bucket_data[$commit_id])) {
 94        // Rebuild the cache bucket, since the commit might be a very recent
 95        // one that we'll pick up by rebuilding.
 96
 97        $bucket_data = $this->getBucketData($bucket_key, $bucket_data);
 98        if (empty($bucket_data[$commit_id])) {
 99          // A rebuild didn't help. This can occur legitimately if the commit
100          // is new and hasn't parsed yet.
101          return false;
102        }
103
104        // Otherwise, the rebuild gave us the data, so we can keep going.
105
106        $did_fill = true;
107      } else {
108        $did_fill = false;
109      }
110
111      // Sanity check so we can survive and recover from bad data.
112      if (isset($seen[$commit_id])) {
113        phlog(pht('Unexpected infinite loop in %s!', self::class));
114        return false;
115      } else {
116        $seen[$commit_id] = true;
117      }
118
119      // `$data` is a list: the commit's parent IDs, followed by `null`,
120      // followed by the modified paths in ascending order. We figure out the
121      // first parent first, then check if the path was touched. If the path
122      // was touched, this is the commit we're after. If not, walk backward
123      // in the tree.
124
125      $items = $bucket_data[$commit_id];
126      $size = count($items);
127
128      // Walk past the parent information.
129      $parent_id = null;
130      for ($ii = 0;; ++$ii) {
131        if ($items[$ii] === null) {
132          break;
133        }
134        if ($parent_id === null) {
135          $parent_id = $items[$ii];
136        }
137      }
138
139      // Look for a modification to the path.
140      for (; $ii < $size; ++$ii) {
141        $item = $items[$ii];
142        if ($item > $path_id) {
143          break;
144        }
145        if ($item === $path_id) {
146          return $commit_id;
147        }
148      }
149
150      if ($parent_id) {
151        $commit_id = $parent_id;
152
153        // Periodically check if we've spent too long looking for a result
154        // in the cache, and return so we can fall back to a VCS operation.
155        // This keeps us from having a degenerate worst case if, e.g., the
156        // cache is cold and we need to inspect a very large number of blocks
157        // to satisfy the query.
158
159        ++$iterations;
160
161        // If we performed a cache fill in this cycle, always check the time
162        // limit, since cache fills may take a significant amount of time.
163
164        if ($did_fill || ($iterations % 64 === 0)) {
165          $t_end = microtime(true);
166          if (($t_end - $t_start) > $time) {
167            return false;
168          }
169        }
170        continue;
171      }
172
173      // If we have an explicit 0, that means this commit really has no parents.
174      // Usually, it is the first commit in the repository.
175      if ($parent_id === 0) {
176        return null;
177      }
178
179      // If we didn't find a parent, the parent data isn't available. We fail
180      // to find an answer in the cache and fall back to querying the VCS.
181      return false;
182    }
183  }
184
185
186/* -(  Cache Internals  )---------------------------------------------------- */
187
188
189  /**
190   * Get the bucket key for a given commit ID.
191   *
192   * @param   int   $commit_id Commit ID.
193   * @return  int   Bucket key.
194   * @task cache
195   */
196  private function getBucketKey($commit_id) {
197    return (int)floor($commit_id / $this->getBucketSize());
198  }
199
200
201  /**
202   * Get the cache key for a given bucket key (from @{method:getBucketKey}).
203   *
204   * @param   int     $bucket_key Bucket key.
205   * @return  string  Cache key.
206   * @task cache
207   */
208  private function getBucketCacheKey($bucket_key) {
209    static $prefix;
210
211    if ($prefix === null) {
212      $self = get_class($this);
213      $size = $this->getBucketSize();
214      $prefix = "{$self}:{$size}:2:";
215    }
216
217    return $prefix.$bucket_key;
218  }
219
220
221  /**
222   * Get the number of items per bucket.
223   *
224   * @return  int Number of items to store per bucket.
225   * @task cache
226   */
227  private function getBucketSize() {
228    return 4096;
229  }
230
231
232  /**
233   * Retrieve or build a graph cache bucket from the cache.
234   *
235   * Normally, this operates as a readthrough cache call. It can also be used
236   * to force a cache update by passing the existing data to `$rebuild_data`.
237   *
238   * @param   int    $bucket_key Bucket key, from @{method:getBucketKey}.
239   * @param   mixed  $rebuild_data (optional) Current data, to force a cache
240   *                 rebuild of this bucket.
241   * @return  array  Data from the cache.
242   * @task cache
243   */
244  private function getBucketData($bucket_key, $rebuild_data = null) {
245    $cache_key = $this->getBucketCacheKey($bucket_key);
246
247    // TODO: This cache stuff could be handled more gracefully, but the
248    // database cache currently requires values to be strings and needs
249    // some tweaking to support this as part of a stack. Our cache semantics
250    // here are also unusual (not purely readthrough) because this cache is
251    // appendable.
252
253    $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache();
254    $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache();
255    if ($rebuild_data === null) {
256      $bucket_data = $cache_level1->getKey($cache_key);
257      if ($bucket_data) {
258        return $bucket_data;
259      }
260
261      $bucket_data = $cache_level2->getKey($cache_key);
262      if ($bucket_data) {
263        $unserialized = @unserialize($bucket_data);
264        if ($unserialized) {
265          // Fill APC if we got a database hit but missed in APC.
266          $cache_level1->setKey($cache_key, $unserialized);
267          return $unserialized;
268        }
269      }
270    }
271
272    if (!is_array($rebuild_data)) {
273      $rebuild_data = array();
274    }
275
276    $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data);
277
278    // Don't bother writing the data if we didn't update anything.
279    if ($bucket_data !== $rebuild_data) {
280      $cache_level2->setKey($cache_key, serialize($bucket_data));
281      $cache_level1->setKey($cache_key, $bucket_data);
282    }
283
284    return $bucket_data;
285  }
286
287
288  /**
289   * Rebuild a cache bucket, amending existing data if available.
290   *
291   * @param   int   $bucket_key Bucket key, from @{method:getBucketKey}.
292   * @param   array $current_data Existing bucket data.
293   * @return  array Rebuilt bucket data.
294   * @task cache
295   */
296  private function rebuildBucket($bucket_key, array $current_data) {
297
298    // First, check if we've already rebuilt this bucket. In some cases (like
299    // browsing a repository at some commit) it's common to issue many lookups
300    // against one commit. If that commit has been discovered but not yet
301    // fully imported, we'll repeatedly attempt to rebuild the bucket. If the
302    // first rebuild did not work, subsequent rebuilds are very unlikely to
303    // have any effect. We can just skip the rebuild in these cases.
304
305    if (isset($this->rebuiltKeys[$bucket_key])) {
306      return $current_data;
307    } else {
308      $this->rebuiltKeys[$bucket_key] = true;
309    }
310
311    $bucket_min = ($bucket_key * $this->getBucketSize());
312    $bucket_max = ($bucket_min + $this->getBucketSize()) - 1;
313
314    // We need to reload all of the commits in the bucket because there is
315    // no guarantee that they'll get parsed in order, so we can fill large
316    // commit IDs before small ones. Later on, we'll ignore the commits we
317    // already know about.
318
319    $table_commit = new PhabricatorRepositoryCommit();
320    $table_repository = new PhabricatorRepository();
321    $conn_r = $table_commit->establishConnection('r');
322
323    // Find all the Git and Mercurial commits in the block which have completed
324    // change import. We can't fill the cache accurately for commits which have
325    // not completed change import, so just pretend we don't know about them.
326    // In these cases, we will ultimately fall back to VCS queries.
327
328    $commit_rows = queryfx_all(
329      $conn_r,
330      'SELECT c.id FROM %T c
331        JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls)
332        WHERE c.id BETWEEN %d AND %d
333          AND (c.importStatus & %d) = %d',
334      $table_commit->getTableName(),
335      $table_repository->getTableName(),
336      array(
337        PhabricatorRepositoryType::REPOSITORY_TYPE_GIT,
338        PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL,
339      ),
340      $bucket_min,
341      $bucket_max,
342      PhabricatorRepositoryCommit::IMPORTED_CHANGE,
343      PhabricatorRepositoryCommit::IMPORTED_CHANGE);
344
345    // If we don't have any data, just return the existing data.
346    if (!$commit_rows) {
347      return $current_data;
348    }
349
350    // Remove the commits we already have data for. We don't need to rebuild
351    // these. If there's nothing left, return the existing data.
352
353    $commit_ids = ipull($commit_rows, 'id', 'id');
354    $commit_ids = array_diff_key($commit_ids, $current_data);
355
356    if (!$commit_ids) {
357      return $current_data;
358    }
359
360    // Find all the path changes for the new commits.
361    $path_changes = queryfx_all(
362      $conn_r,
363      'SELECT commitID, pathID FROM %T
364        WHERE commitID IN (%Ld)
365        AND (isDirect = 1 OR changeType = %d)',
366      PhabricatorRepository::TABLE_PATHCHANGE,
367      $commit_ids,
368      DifferentialChangeType::TYPE_CHILD);
369    $path_changes = igroup($path_changes, 'commitID');
370
371    // Find all the parents for the new commits.
372    $parents = queryfx_all(
373      $conn_r,
374      'SELECT childCommitID, parentCommitID FROM %T
375        WHERE childCommitID IN (%Ld)
376        ORDER BY id ASC',
377      PhabricatorRepository::TABLE_PARENTS,
378      $commit_ids);
379    $parents = igroup($parents, 'childCommitID');
380
381    // Build the actual data for the cache.
382    foreach ($commit_ids as $commit_id) {
383      $parent_ids = array();
384      if (!empty($parents[$commit_id])) {
385        foreach ($parents[$commit_id] as $row) {
386          $parent_ids[] = (int)$row['parentCommitID'];
387        }
388      } else {
389        // We expect all rows to have parents (commits with no parents get
390        // an explicit "0" placeholder). If we're in an older repository, the
391        // parent information might not have been populated yet. Decline to fill
392        // the cache if we don't have the parent information, since the fill
393        // will be incorrect.
394        continue;
395      }
396
397      if (isset($path_changes[$commit_id])) {
398        $path_ids = $path_changes[$commit_id];
399        foreach ($path_ids as $key => $path_id) {
400          $path_ids[$key] = (int)$path_id['pathID'];
401        }
402        sort($path_ids);
403      } else {
404        $path_ids = array();
405      }
406
407      $value = $parent_ids;
408      $value[] = null;
409      foreach ($path_ids as $path_id) {
410        $value[] = $path_id;
411      }
412
413      $current_data[$commit_id] = $value;
414    }
415
416    return $current_data;
417  }
418
419}