@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.)
hq.recaptime.dev/wiki/Phorge
phorge
phabricator
1<?php
2
3final class PhutilProseDifferenceEngine extends Phobject {
4
5 public function getDiff($u, $v) {
6 return $this->buildDiff($u, $v, 0);
7 }
8
9 private function buildDiff($u, $v, $level) {
10 $u_parts = $this->splitCorpus($u, $level);
11 $v_parts = $this->splitCorpus($v, $level);
12
13 if ($level === 0) {
14 $diff = $this->newHashDiff($u_parts, $v_parts);
15 $too_large = false;
16 } else {
17 list($diff, $too_large) = $this->newEditDistanceMatrixDiff(
18 $u_parts,
19 $v_parts,
20 $level);
21 }
22
23 $diff->reorderParts();
24
25 // If we just built a character-level diff, we're all done and do not
26 // need to go any deeper.
27 if ($level == 3) {
28 return $diff;
29 }
30
31 $blocks = array();
32 $block = null;
33 foreach ($diff->getParts() as $part) {
34 $type = $part['type'];
35 $text = $part['text'];
36 switch ($type) {
37 case '=':
38 if ($block) {
39 $blocks[] = $block;
40 $block = null;
41 }
42 $blocks[] = array(
43 'type' => $type,
44 'text' => $text,
45 );
46 break;
47 case '-':
48 if (!$block) {
49 $block = array(
50 'type' => '!',
51 'old' => '',
52 'new' => '',
53 );
54 }
55 $block['old'] .= $text;
56 break;
57 case '+':
58 if (!$block) {
59 $block = array(
60 'type' => '!',
61 'old' => '',
62 'new' => '',
63 );
64 }
65 $block['new'] .= $text;
66 break;
67 }
68 }
69
70 if ($block) {
71 $blocks[] = $block;
72 }
73
74 $result = new PhutilProseDiff();
75 foreach ($blocks as $block) {
76 $type = $block['type'];
77 if ($type == '=') {
78 $result->addPart('=', $block['text']);
79 } else {
80 $old = $block['old'];
81 $new = $block['new'];
82 if (!strlen($old) && !strlen($new)) {
83 // Nothing to do.
84 } else if (!strlen($old)) {
85 $result->addPart('+', $new);
86 } else if (!strlen($new)) {
87 $result->addPart('-', $old);
88 } else {
89 if ($too_large) {
90 // If this text was too big to diff, don't try to subdivide it.
91 $result->addPart('-', $old);
92 $result->addPart('+', $new);
93 } else {
94 $subdiff = $this->buildDiff(
95 $old,
96 $new,
97 $level + 1);
98
99 foreach ($subdiff->getParts() as $part) {
100 $result->addPart($part['type'], $part['text']);
101 }
102 }
103 }
104 }
105 }
106
107 $result->reorderParts();
108
109 return $result;
110 }
111
112 private function splitCorpus($corpus, $level) {
113 switch ($level) {
114 case 0:
115 // Level 0: Split into paragraphs.
116 $expr = '/([\n]+)/';
117 break;
118 case 1:
119 // Level 1: Split into sentences.
120 $expr = '/([\n,!;?\.]+)/';
121 break;
122 case 2:
123 // Level 2: Split into words.
124 $expr = '/(\s+)/';
125 break;
126 case 3:
127 // Level 3: Split into characters.
128 return phutil_utf8v_combined($corpus);
129 }
130
131 $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
132 return $this->stitchPieces($pieces, $level);
133 }
134
135 private function stitchPieces(array $pieces, $level) {
136 $results = array();
137 $count = count($pieces);
138 for ($ii = 0; $ii < $count; $ii += 2) {
139 $result = $pieces[$ii];
140 if ($ii + 1 < $count) {
141 $result .= $pieces[$ii + 1];
142 }
143
144 if ($level < 2) {
145 $trimmed_pieces = $this->trimApart($result);
146 foreach ($trimmed_pieces as $trimmed_piece) {
147 $results[] = $trimmed_piece;
148 }
149 } else {
150 $results[] = $result;
151 }
152 }
153
154 // If the input ended with a delimiter, we can get an empty final piece.
155 // Just discard it.
156 if (last($results) == '') {
157 array_pop($results);
158 }
159
160 return $results;
161 }
162
163 private function newEditDistanceMatrixDiff(
164 array $u_parts,
165 array $v_parts,
166 $level) {
167
168 $matrix = id(new PhutilEditDistanceMatrix())
169 ->setMaximumLength(128)
170 ->setSequences($u_parts, $v_parts)
171 ->setComputeString(true);
172
173 // For word-level and character-level changes, smooth the output string
174 // to reduce the choppiness of the diff.
175 if ($level > 1) {
176 $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
177 }
178
179 $u_pos = 0;
180 $v_pos = 0;
181
182 $edits = $matrix->getEditString();
183 $edits_length = strlen($edits);
184
185 $diff = new PhutilProseDiff();
186 for ($ii = 0; $ii < $edits_length; $ii++) {
187 $c = $edits[$ii];
188 if ($c == 's') {
189 $diff->addPart('=', $u_parts[$u_pos]);
190 $u_pos++;
191 $v_pos++;
192 } else if ($c == 'd') {
193 $diff->addPart('-', $u_parts[$u_pos]);
194 $u_pos++;
195 } else if ($c == 'i') {
196 $diff->addPart('+', $v_parts[$v_pos]);
197 $v_pos++;
198 } else if ($c == 'x') {
199 $diff->addPart('-', $u_parts[$u_pos]);
200 $diff->addPart('+', $v_parts[$v_pos]);
201 $u_pos++;
202 $v_pos++;
203 } else {
204 throw new Exception(
205 pht(
206 'Unexpected character ("%s") in edit string.',
207 $c));
208 }
209 }
210
211 return array($diff, $matrix->didReachMaximumLength());
212 }
213
214 private function newHashDiff(array $u_parts, array $v_parts) {
215
216 $u_ref = new PhabricatorDocumentRef();
217 $v_ref = new PhabricatorDocumentRef();
218
219 $u_blocks = $this->newDocumentEngineBlocks($u_parts);
220 $v_blocks = $this->newDocumentEngineBlocks($v_parts);
221
222 $rows = id(new PhabricatorDocumentEngineBlocks())
223 ->addBlockList($u_ref, $u_blocks)
224 ->addBlockList($v_ref, $v_blocks)
225 ->newTwoUpLayout();
226
227 $diff = new PhutilProseDiff();
228 foreach ($rows as $row) {
229 list($u_block, $v_block) = $row;
230
231 if ($u_block && $v_block) {
232 if ($u_block->getDifferenceType() === '-') {
233 $diff->addPart('-', $u_block->getContent());
234 $diff->addPart('+', $v_block->getContent());
235 } else {
236 $diff->addPart('=', $u_block->getContent());
237 }
238 } else if ($u_block) {
239 $diff->addPart('-', $u_block->getContent());
240 } else {
241 $diff->addPart('+', $v_block->getContent());
242 }
243 }
244
245 return $diff;
246 }
247
248 private function newDocumentEngineBlocks(array $parts) {
249 $blocks = array();
250
251 foreach ($parts as $part) {
252 $hash = PhabricatorHash::digestForIndex($part);
253
254 $blocks[] = id(new PhabricatorDocumentEngineBlock())
255 ->setContent($part)
256 ->setDifferenceHash($hash);
257 }
258
259 return $blocks;
260 }
261
262 public static function trimApart($input) {
263 // Split pieces into separate text and whitespace sections: make one
264 // piece out of all the whitespace at the beginning, one piece out of
265 // all the actual text in the middle, and one piece out of all the
266 // whitespace at the end.
267
268 $parts = array();
269
270 $length = strlen($input);
271
272 $corpus = ltrim($input);
273 $l_length = strlen($corpus);
274 if ($l_length !== $length) {
275 $parts[] = substr($input, 0, $length - $l_length);
276 }
277
278 $corpus = rtrim($corpus);
279 $lr_length = strlen($corpus);
280
281 if ($lr_length) {
282 $parts[] = $corpus;
283 }
284
285 if ($lr_length !== $l_length) {
286 // NOTE: This will be a negative value; we're slicing from the end of
287 // the input string.
288 $parts[] = substr($input, $lr_length - $l_length);
289 }
290
291 return $parts;
292 }
293
294}