@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.)
hq.recaptime.dev/wiki/Phorge
phorge
phabricator
1<?php
2
3/**
4 * Given a commit and a path, efficiently determine the most recent ancestor
5 * commit where the path was touched.
6 *
7 * In Git and Mercurial, log operations with a path are relatively slow. For
8 * example:
9 *
10 * git log -n1 <commit> -- <path>
11 *
12 * ...routinely takes several hundred milliseconds, and equivalent requests
13 * often take longer in Mercurial.
14 *
15 * Unfortunately, this operation is fundamental to rendering a repository for
16 * the web, and essentially everything else that's slow can be reduced to this
17 * plus some trivial work afterward. Making this fast is desirable and powerful,
18 * and allows us to make other things fast by expressing them in terms of this
19 * query.
20 *
21 * Because the query is fundamentally a graph query, it isn't easy to express
22 * in a reasonable way in MySQL, and we can't do round trips to the server to
23 * walk the graph without incurring huge performance penalties.
24 *
25 * However, the total amount of data in the graph is relatively small. By
26 * caching it in chunks and keeping it in APC, we can reasonably load and walk
27 * the graph in PHP quickly.
28 *
29 * For more context, see T2683.
30 *
31 * Structure of the Cache
32 * ======================
33 *
34 * The cache divides commits into buckets (see @{method:getBucketSize}). To
35 * walk the graph, we pull a commit's bucket. The bucket is a map from commit
36 * IDs to a list of parents and changed paths, separated by `null`. For
37 * example, a bucket might look like this:
38 *
39 * array(
40 * 1 => array(0, null, 17, 18),
41 * 2 => array(1, null, 4),
42 * // ...
43 * )
44 *
45 * This means that commit ID 1 has parent commit 0 (a special value meaning
46 * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1,
47 * and affected path 4.
48 *
49 * This data structure attempts to balance compactness, ease of construction,
50 * simplicity of cache semantics, and lookup performance. In the average case,
51 * it appears to do a reasonable job at this.
52 *
53 * @task query Querying the Graph Cache
54 * @task cache Cache Internals
55 */
56final class PhabricatorRepositoryGraphCache extends Phobject {
57
58 private $rebuiltKeys = array();
59
60
61/* -( Querying the Graph Cache )------------------------------------------- */
62
63
64 /**
65 * Search the graph cache for the most recent modification to a path.
66 *
67 * @param int $commit_id The commit ID to search ancestors of.
68 * @param int $path_id The path ID to search for changes to.
69 * @param float $time Maximum number of seconds to spend trying to satisfy
70 * this query using the graph cache. By default `0.5` (500ms).
71 * @return mixed Commit ID, or `null` if no ancestors exist, or `false` if
72 * the graph cache was unable to determine the answer.
73 * @task query
74 */
75 public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) {
76 $commit_id = (int)$commit_id;
77 $path_id = (int)$path_id;
78
79 $bucket_data = null;
80 $data_key = null;
81 $seen = array();
82
83 $t_start = microtime(true);
84 $iterations = 0;
85 while (true) {
86 $bucket_key = $this->getBucketKey($commit_id);
87
88 if (($data_key != $bucket_key) || $bucket_data === null) {
89 $bucket_data = $this->getBucketData($bucket_key);
90 $data_key = $bucket_key;
91 }
92
93 if (empty($bucket_data[$commit_id])) {
94 // Rebuild the cache bucket, since the commit might be a very recent
95 // one that we'll pick up by rebuilding.
96
97 $bucket_data = $this->getBucketData($bucket_key, $bucket_data);
98 if (empty($bucket_data[$commit_id])) {
99 // A rebuild didn't help. This can occur legitimately if the commit
100 // is new and hasn't parsed yet.
101 return false;
102 }
103
104 // Otherwise, the rebuild gave us the data, so we can keep going.
105
106 $did_fill = true;
107 } else {
108 $did_fill = false;
109 }
110
111 // Sanity check so we can survive and recover from bad data.
112 if (isset($seen[$commit_id])) {
113 phlog(pht('Unexpected infinite loop in %s!', self::class));
114 return false;
115 } else {
116 $seen[$commit_id] = true;
117 }
118
119 // `$data` is a list: the commit's parent IDs, followed by `null`,
120 // followed by the modified paths in ascending order. We figure out the
121 // first parent first, then check if the path was touched. If the path
122 // was touched, this is the commit we're after. If not, walk backward
123 // in the tree.
124
125 $items = $bucket_data[$commit_id];
126 $size = count($items);
127
128 // Walk past the parent information.
129 $parent_id = null;
130 for ($ii = 0;; ++$ii) {
131 if ($items[$ii] === null) {
132 break;
133 }
134 if ($parent_id === null) {
135 $parent_id = $items[$ii];
136 }
137 }
138
139 // Look for a modification to the path.
140 for (; $ii < $size; ++$ii) {
141 $item = $items[$ii];
142 if ($item > $path_id) {
143 break;
144 }
145 if ($item === $path_id) {
146 return $commit_id;
147 }
148 }
149
150 if ($parent_id) {
151 $commit_id = $parent_id;
152
153 // Periodically check if we've spent too long looking for a result
154 // in the cache, and return so we can fall back to a VCS operation.
155 // This keeps us from having a degenerate worst case if, e.g., the
156 // cache is cold and we need to inspect a very large number of blocks
157 // to satisfy the query.
158
159 ++$iterations;
160
161 // If we performed a cache fill in this cycle, always check the time
162 // limit, since cache fills may take a significant amount of time.
163
164 if ($did_fill || ($iterations % 64 === 0)) {
165 $t_end = microtime(true);
166 if (($t_end - $t_start) > $time) {
167 return false;
168 }
169 }
170 continue;
171 }
172
173 // If we have an explicit 0, that means this commit really has no parents.
174 // Usually, it is the first commit in the repository.
175 if ($parent_id === 0) {
176 return null;
177 }
178
179 // If we didn't find a parent, the parent data isn't available. We fail
180 // to find an answer in the cache and fall back to querying the VCS.
181 return false;
182 }
183 }
184
185
186/* -( Cache Internals )---------------------------------------------------- */
187
188
189 /**
190 * Get the bucket key for a given commit ID.
191 *
192 * @param int $commit_id Commit ID.
193 * @return int Bucket key.
194 * @task cache
195 */
196 private function getBucketKey($commit_id) {
197 return (int)floor($commit_id / $this->getBucketSize());
198 }
199
200
201 /**
202 * Get the cache key for a given bucket key (from @{method:getBucketKey}).
203 *
204 * @param int $bucket_key Bucket key.
205 * @return string Cache key.
206 * @task cache
207 */
208 private function getBucketCacheKey($bucket_key) {
209 static $prefix;
210
211 if ($prefix === null) {
212 $self = get_class($this);
213 $size = $this->getBucketSize();
214 $prefix = "{$self}:{$size}:2:";
215 }
216
217 return $prefix.$bucket_key;
218 }
219
220
221 /**
222 * Get the number of items per bucket.
223 *
224 * @return int Number of items to store per bucket.
225 * @task cache
226 */
227 private function getBucketSize() {
228 return 4096;
229 }
230
231
232 /**
233 * Retrieve or build a graph cache bucket from the cache.
234 *
235 * Normally, this operates as a readthrough cache call. It can also be used
236 * to force a cache update by passing the existing data to `$rebuild_data`.
237 *
238 * @param int $bucket_key Bucket key, from @{method:getBucketKey}.
239 * @param mixed $rebuild_data (optional) Current data, to force a cache
240 * rebuild of this bucket.
241 * @return array Data from the cache.
242 * @task cache
243 */
244 private function getBucketData($bucket_key, $rebuild_data = null) {
245 $cache_key = $this->getBucketCacheKey($bucket_key);
246
247 // TODO: This cache stuff could be handled more gracefully, but the
248 // database cache currently requires values to be strings and needs
249 // some tweaking to support this as part of a stack. Our cache semantics
250 // here are also unusual (not purely readthrough) because this cache is
251 // appendable.
252
253 $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache();
254 $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache();
255 if ($rebuild_data === null) {
256 $bucket_data = $cache_level1->getKey($cache_key);
257 if ($bucket_data) {
258 return $bucket_data;
259 }
260
261 $bucket_data = $cache_level2->getKey($cache_key);
262 if ($bucket_data) {
263 $unserialized = @unserialize($bucket_data);
264 if ($unserialized) {
265 // Fill APC if we got a database hit but missed in APC.
266 $cache_level1->setKey($cache_key, $unserialized);
267 return $unserialized;
268 }
269 }
270 }
271
272 if (!is_array($rebuild_data)) {
273 $rebuild_data = array();
274 }
275
276 $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data);
277
278 // Don't bother writing the data if we didn't update anything.
279 if ($bucket_data !== $rebuild_data) {
280 $cache_level2->setKey($cache_key, serialize($bucket_data));
281 $cache_level1->setKey($cache_key, $bucket_data);
282 }
283
284 return $bucket_data;
285 }
286
287
288 /**
289 * Rebuild a cache bucket, amending existing data if available.
290 *
291 * @param int $bucket_key Bucket key, from @{method:getBucketKey}.
292 * @param array $current_data Existing bucket data.
293 * @return array Rebuilt bucket data.
294 * @task cache
295 */
296 private function rebuildBucket($bucket_key, array $current_data) {
297
298 // First, check if we've already rebuilt this bucket. In some cases (like
299 // browsing a repository at some commit) it's common to issue many lookups
300 // against one commit. If that commit has been discovered but not yet
301 // fully imported, we'll repeatedly attempt to rebuild the bucket. If the
302 // first rebuild did not work, subsequent rebuilds are very unlikely to
303 // have any effect. We can just skip the rebuild in these cases.
304
305 if (isset($this->rebuiltKeys[$bucket_key])) {
306 return $current_data;
307 } else {
308 $this->rebuiltKeys[$bucket_key] = true;
309 }
310
311 $bucket_min = ($bucket_key * $this->getBucketSize());
312 $bucket_max = ($bucket_min + $this->getBucketSize()) - 1;
313
314 // We need to reload all of the commits in the bucket because there is
315 // no guarantee that they'll get parsed in order, so we can fill large
316 // commit IDs before small ones. Later on, we'll ignore the commits we
317 // already know about.
318
319 $table_commit = new PhabricatorRepositoryCommit();
320 $table_repository = new PhabricatorRepository();
321 $conn_r = $table_commit->establishConnection('r');
322
323 // Find all the Git and Mercurial commits in the block which have completed
324 // change import. We can't fill the cache accurately for commits which have
325 // not completed change import, so just pretend we don't know about them.
326 // In these cases, we will ultimately fall back to VCS queries.
327
328 $commit_rows = queryfx_all(
329 $conn_r,
330 'SELECT c.id FROM %T c
331 JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls)
332 WHERE c.id BETWEEN %d AND %d
333 AND (c.importStatus & %d) = %d',
334 $table_commit->getTableName(),
335 $table_repository->getTableName(),
336 array(
337 PhabricatorRepositoryType::REPOSITORY_TYPE_GIT,
338 PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL,
339 ),
340 $bucket_min,
341 $bucket_max,
342 PhabricatorRepositoryCommit::IMPORTED_CHANGE,
343 PhabricatorRepositoryCommit::IMPORTED_CHANGE);
344
345 // If we don't have any data, just return the existing data.
346 if (!$commit_rows) {
347 return $current_data;
348 }
349
350 // Remove the commits we already have data for. We don't need to rebuild
351 // these. If there's nothing left, return the existing data.
352
353 $commit_ids = ipull($commit_rows, 'id', 'id');
354 $commit_ids = array_diff_key($commit_ids, $current_data);
355
356 if (!$commit_ids) {
357 return $current_data;
358 }
359
360 // Find all the path changes for the new commits.
361 $path_changes = queryfx_all(
362 $conn_r,
363 'SELECT commitID, pathID FROM %T
364 WHERE commitID IN (%Ld)
365 AND (isDirect = 1 OR changeType = %d)',
366 PhabricatorRepository::TABLE_PATHCHANGE,
367 $commit_ids,
368 DifferentialChangeType::TYPE_CHILD);
369 $path_changes = igroup($path_changes, 'commitID');
370
371 // Find all the parents for the new commits.
372 $parents = queryfx_all(
373 $conn_r,
374 'SELECT childCommitID, parentCommitID FROM %T
375 WHERE childCommitID IN (%Ld)
376 ORDER BY id ASC',
377 PhabricatorRepository::TABLE_PARENTS,
378 $commit_ids);
379 $parents = igroup($parents, 'childCommitID');
380
381 // Build the actual data for the cache.
382 foreach ($commit_ids as $commit_id) {
383 $parent_ids = array();
384 if (!empty($parents[$commit_id])) {
385 foreach ($parents[$commit_id] as $row) {
386 $parent_ids[] = (int)$row['parentCommitID'];
387 }
388 } else {
389 // We expect all rows to have parents (commits with no parents get
390 // an explicit "0" placeholder). If we're in an older repository, the
391 // parent information might not have been populated yet. Decline to fill
392 // the cache if we don't have the parent information, since the fill
393 // will be incorrect.
394 continue;
395 }
396
397 if (isset($path_changes[$commit_id])) {
398 $path_ids = $path_changes[$commit_id];
399 foreach ($path_ids as $key => $path_id) {
400 $path_ids[$key] = (int)$path_id['pathID'];
401 }
402 sort($path_ids);
403 } else {
404 $path_ids = array();
405 }
406
407 $value = $parent_ids;
408 $value[] = null;
409 foreach ($path_ids as $path_id) {
410 $value[] = $path_id;
411 }
412
413 $current_data[$commit_id] = $value;
414 }
415
416 return $current_data;
417 }
418
419}