# Script to retroactively add commitids to past OpenBSD commits.
1#
2# Copyright (c) 2014, 2016 joshua stein <jcs@jcs.org>
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions
6# are met:
7#
8# 1. Redistributions of source code must retain the above copyright
9# notice, this list of conditions and the following disclaimer.
10# 2. Redistributions in binary form must reproduce the above copyright
11# notice, this list of conditions and the following disclaimer in the
12# documentation and/or other materials provided with the distribution.
13# 3. The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26#
27
28class Scanner
29 attr_accessor :outputter, :db, :commitid_hacks, :prev_revision_hacks
30
31 # how long commits by the same author with the same commit message can be
32 # from each other and still be grouped in the same changeset
33 MAX_GROUP_WINDOW = (60 * 5)
34
35 def initialize(dbf, root)
36 @db = Db.new dbf
37 @root = (root + "/").gsub(/\/\//, "/")
38 @outputter = Outputter.new(self)
39 @prev_revision_hacks = {}
40 @commitid_hacks = {}
41 end
42
43 def recursively_scan(dir = nil)
44 if !dir
45 dir = @root
46 end
47
48 puts "recursing into #{dir}"
49
50 Dir.glob((dir + "/*").gsub(/\/\//, "/")).each do |f|
51 if Dir.exists?(f)
52 recursively_scan(f)
53 elsif f.match(/,v$/)
54 scan(f)
55 end
56 end
57 end
58
59 def scan(f)
60 cksum = ""
61 IO.popen([ "cksum", "-q", f ]) do |c|
62 parts = c.read.force_encoding("iso-8859-1").split(" ")
63 if parts.length != 2
64 raise "invalid output from cksum: #{parts.inspect}"
65 end
66
67 cksum = parts[0].encode("utf-8")
68 end
69
70 canfile = f[@root.length, f.length - @root.length].gsub(/(^|\/)Attic\//,
71 "/").gsub(/^\/*/, "")
72
73 fid = @db.execute("SELECT id, first_undead_version, cksum FROM files " +
74 "WHERE file = ?", [ canfile ]).first
75 if fid && fid["cksum"].to_s == cksum
76 return
77 end
78
79 puts " scanning file #{canfile}"
80
81 rcs = RCSFile.new(f)
82
83 @db.execute("BEGIN")
84
85 if fid
86 if fid["first_undead_version"] != rcs.first_undead_version
87 @db.execute("UPDATE files SET first_undead_version = ? WHERE id = ?",
88 [ rcs.first_undead_version, fid["id"] ])
89 end
90 else
91 @db.execute("INSERT INTO files (file, first_undead_version) VALUES " +
92 "(?, ?)", [ canfile, rcs.first_undead_version ])
93 fid = @db.execute("SELECT id FROM files WHERE file = ?",
94 [ canfile ]).first
95 end
96 raise if !fid
97
98 if @commitid_hacks && @commitid_hacks[canfile]
99 @commitid_hacks[canfile].each do |v,cid|
100 if rcs.revisions[v].commitid &&
101 rcs.revisions[v].commitid != cid
102 raise "hack for #{canfile}:#{v} commitid of #{cid.inspect} would " +
103 "overwrite #{rcs.revisions[v].commitid}"
104 end
105
106 puts " faking commitid for revision #{v} -> #{cid}"
107 rcs.revisions[v].commitid = cid
108 end
109 end
110
111 rcs.revisions.each do |r,rev|
112 rid = @db.execute("SELECT id, commitid FROM revisions WHERE " +
113 "file_id = ? AND version = ?", [ fid["id"], r ]).first
114
115 if rid
116 if rid["commitid"] != rev.commitid
117 puts " updated #{r} to commitid #{rev.commitid}" +
118 (rid["commitid"].to_s == "" ? "" : " from #{rid["commitid"]}")
119
120 @db.execute("UPDATE revisions SET commitid = ? WHERE file_id = ? " +
121 "AND version = ?", [ rev.commitid, fid["id"], rev.version ])
122 end
123 else
124 # files added on branches/imports have unhelpful commit messages with
125 # the helpful ones on the branch versions, so copy them over while
126 # we're here
127 if rev.log.to_s == "Initial revision"
128 if r == "1.1" && rcs.revisions["1.1.1.1"]
129 rev.log = rcs.revisions["1.1.1.1"].log
130 puts " revision #{r} using log from 1.1.1.1"
131 else
132 puts " revision #{r} keeping log #{rev.log.inspect}, no 1.1.1.1"
133 end
134 elsif m = rev.log.to_s.
135 match(/\Afile .+? was initially added on branch ([^\.]+)\.\z/)
136 brver = nil
137 if br = rcs.symbols[m[1]]
138 brver = RCSRevision.first_branch_version_of(br)
139 if !rcs.revisions[brver]
140 if rcs.revisions[brver + ".1"]
141 brver += ".1"
142 else
143 puts " revision #{r} keeping log #{rev.log.inspect}, no #{brver}"
144 brver = nil
145 end
146 end
147 end
148
149 if brver
150 rev.log = rcs.revisions[brver].log
151 puts " revision #{r} using log from #{brver}"
152
153 # but consider this trunk revision on the branch the file was added
154 # on, just so we keep it in the same changeset
155 rev.branch = rcs.revisions[brver].branch
156 else
157 puts " revision #{r} keeping log #{rev.log.inspect}, no #{m[1]}"
158 end
159 end
160
161 puts " inserted #{r}" +
162 (rev.branch ? " (branch #{rev.branch})" : "") +
163 ", authored #{rev.date} by #{rev.author}" +
164 (rev.commitid ? ", commitid #{rev.commitid}" : "")
165
166 @db.execute("INSERT INTO revisions (file_id, date, version, author, " +
167 "commitid, state, log, branch) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
168 [ fid["id"], rev.date, rev.version, rev.author, rev.commitid,
169 rev.state, rev.log, rev.branch ])
170 rid = { "id" => @db.last_insert_row_id }
171 end
172
173 vbs = @db.execute("SELECT branch FROM vendor_branches WHERE " +
174 "revision_id = ?", [ rid["id"] ]).map{|r| r["branch"] }.flatten
175
176 rev.vendor_branches.each do |vb|
177 if !vbs.include?(vb)
178 puts " inserting vendor branch #{vb}"
179 @db.execute("INSERT INTO vendor_branches (revision_id, branch) " +
180 "VALUES (?, ?)", [ rid["id"], vb ])
181 end
182 end
183
184 vbs.each do |vb|
185 if !rev.vendor_branches.include?(vb)
186 @db.execute("DELETE FROM vendor_branches WHERE revision_id = ? " +
187 "AND branch = ?", [ rid["id"], vb ])
188 end
189 end
190 end
191
192 @db.execute("UPDATE files SET cksum = ? WHERE id = ?",
193 [ cksum, fid["id"] ])
194
195 @db.execute("COMMIT")
196 end
197
198 def group_into_changesets
199 puts "grouping into changesets"
200
201 new_sets = []
202 last_row = {}
203 cur_set = []
204
205 @db.execute("BEGIN")
206
207 # commits by the same author with the same log message within a small
208 # timeframe are grouped together
209 @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL ORDER " +
210 "BY author ASC, branch ASC, commitid ASC, date ASC") do |row|
211 if last_row.any? &&
212 row["author"] == last_row["author"] &&
213 row["branch"] == last_row["branch"] &&
214 row["log"] == last_row["log"] &&
215 row["commitid"] == last_row["commitid"] &&
216 row["date"].to_i - last_row["date"].to_i <= MAX_GROUP_WINDOW
217 cur_set.push row["id"].to_i
218 elsif !last_row.any?
219 cur_set.push row["id"].to_i
220 else
221 if cur_set.any?
222 new_sets.push cur_set
223 cur_set = []
224 end
225 cur_set.push row["id"].to_i
226 end
227
228 last_row = row
229 end
230
231 if cur_set.any?
232 new_sets.push cur_set
233 end
234
235 new_sets.each do |s|
236 puts " new set with revision ids #{s.inspect}"
237 @db.execute("INSERT INTO changesets (id) VALUES (NULL)")
238 id = @db.execute("SELECT last_insert_rowid() AS id").first["id"]
239 raise if !id
240
241 # avoid an exception caused by passing too many variables
242 s.each_slice(100) do |chunk|
243 @db.execute("UPDATE revisions SET changeset_id = ? WHERE id IN (" +
244 chunk.map{|a| "?" }.join(",") + ")", [ id ] + chunk)
245 end
246 end
247
248 if @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL").any?
249 raise "still have revisions with empty changesets"
250 end
251
252 @db.execute("COMMIT")
253 end
254
255 def stray_commitids_to_changesets
256 @db.execute("BEGIN")
257
258 puts "finding stray commitids"
259
260 stray_commitids = @db.execute("SELECT DISTINCT author, commitid FROM " +
261 "revisions WHERE commitid IS NOT NULL AND changeset_id IS NULL")
262 stray_commitids.each do |row|
263 csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
264 [ row["commitid"] ]).first
265 if !csid
266 @db.execute("INSERT INTO changesets (author, commitid) VALUES (?, ?)",
267 [ row["author"], row["commitid"] ])
268 csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
269 [ row["commitid"] ]).first
270 end
271 raise if !csid
272
273 puts " commitid #{row["commitid"]} -> changeset #{csid["id"]}"
274
275 @db.execute("UPDATE revisions SET changeset_id = ? WHERE commitid = ?",
276 [ csid["id"], row["commitid"] ])
277 end
278
279 @db.execute("COMMIT")
280 end
281
282 def fill_in_changeset_data
283 puts "assigning dates to changesets"
284
285 @db.execute("BEGIN")
286
287 cses = {}
288 @db.execute("SELECT id, commitid FROM changesets WHERE date IS NULL") do |c|
289 cses[c["id"]] = c["commitid"]
290 end
291
292 # create canonical dates for each changeset, so we can pull them back out
293 # in order
294 cses.each do |csid,comid|
295 date = nil
296 commitid = comid
297 log = nil
298 author = nil
299 branch = nil
300
301 @db.execute("SELECT * FROM revisions WHERE changeset_id = ? ORDER BY " +
302 "date ASC", [ csid ]) do |rev|
303 if !date
304 date = rev["date"]
305 end
306
307 if log && rev["log"] != log
308 raise "logs different between revs of #{csid}"
309 else
310 log = rev["log"]
311 end
312
313 if author && rev["author"] != author
314 raise "authors different between revs of #{csid}"
315 else
316 author = rev["author"]
317 end
318
319 if branch && rev["branch"] != branch
320 raise "branches different between revs of #{csid}"
321 else
322 branch = rev["branch"]
323 end
324 end
325
326 if !date
327 raise "no date for changeset #{csid}"
328 end
329
330 @db.execute("UPDATE changesets SET date = ?, log = ?, author = ?, " +
331 "branch = ? WHERE id = ?", [ date, log, author, branch, csid ])
332 end
333
334 @db.execute("COMMIT")
335
336 puts "assigning changeset order"
337
338 cses = []
339 @db.execute("SELECT id FROM changesets WHERE csorder IS NULL ORDER BY " +
340 "date, author") do |c|
341 cses.push c["id"]
342 end
343
344 highestcs = @db.execute("SELECT MAX(csorder) AS lastcs FROM changesets " +
345 "WHERE csorder IS NOT NULL").first["lastcs"].to_i
346
347 @db.execute("BEGIN")
348 cses.each do |cs|
349 highestcs += 1
350 @db.execute("UPDATE changesets SET csorder = ?, commitid = NULL WHERE " +
351 "id = ?", [ highestcs, cs ])
352 end
353 @db.execute("COMMIT")
354 end
355
356 def stage_tmp_cvs(tmp_dir, cvs_root, tree)
357 # for a deleted file to be operated by with cvs admin, it must be
358 # present in the CVS/Entries files, so check out all files at rev 1.1 so we
359 # know they will not be deleted. otherwise cvs admin will fail silently
360 if File.exists?("#{tmp_dir}/#{tree}/CVS/Entries")
361 puts "updating #{tmp_dir}#{tree} from #{cvs_root}"
362 Dir.chdir("#{tmp_dir}/#{tree}")
363 system("cvs", "-Q", "-d", cvs_root, "update", "-PAd", "-r1.1") ||
364 raise("cvs update returned non-zero")
365 else
366 puts "checking out #{cvs_root}#{tree} to #{tmp_dir}"
367 Dir.chdir(tmp_dir)
368 system("cvs", "-Q", "-d", cvs_root, "co", "-r1.1", tree) ||
369 raise("cvs checkout returned non-zero")
370 end
371
372 Dir.chdir(tmp_dir)
373
374 # but if any files were added on a branch or somehow have a weird history,
375 # their 1.1 revision will be dead so check out any non-dead revision of
376 # those files
377 dead11s = {}
378 @db.execute("SELECT
379 file, first_undead_version
380 FROM files
381 WHERE first_undead_version NOT LIKE '1.1' AND
382 id IN (SELECT file_id FROM revisions WHERE commitid IS NULL)") do |rev|
383 dead11s[rev["file"]] = rev["first_undead_version"]
384 end
385
386 dead11s.each do |file,rev|
387 confile = file.gsub(/,v$/, "")
388
389 puts " checking out non-dead revision #{rev} of #{confile}"
390
391 system("cvs", "-Q", "-d", cvs_root, "co", "-r#{rev}",
392 "#{tree}/#{confile}") ||
393 raise("cvs co -r#{rev} #{confile} failed")
394 end
395
396 Dir.chdir("#{tmp_dir}/#{tree}")
397 end
398
399 def recalculate_commitids(tmp_dir, cvs_root, tree, genesis)
400 Dir.chdir(tmp_dir + "/#{tree}")
401
402 puts "recalculating new commitids from genesis #{genesis}"
403
404 gfn = "#{cvs_root}/CVSROOT/commitid_genesis"
405 if File.exists?(gfn) && File.read(gfn).strip != genesis
406 raise "genesis in #{gfn} is not #{genesis.inspect}"
407 else
408 File.write("#{cvs_root}/CVSROOT/commitid_genesis", genesis + "\n")
409 end
410
411 changesets = []
412 @db.execute("SELECT id, csorder, commitid FROM changesets
413 ORDER BY csorder ASC") do |cs|
414 changesets.push cs
415 end
416
417 puts " writing commitids-#{tree} (#{changesets.length} " +
418 "changeset#{changesets.length == 1 ? "" : "s"})"
419
420 commitids = File.open("#{cvs_root}/CVSROOT/commitids-#{tree}", "w+")
421
422 # every changeset needs to know the revisions of its files from the
423 # previous change, taking into account branches. we can easily calculate
424 # this, but we should make sure that calculated revision actually exists
425 files = {}
426 @db.execute("SELECT id, file FROM files") do |row|
427 files[row["id"]] = row["file"]
428 end
429 files.each do |id,file|
430 vers = []
431
432 @db.execute("SELECT version FROM revisions WHERE file_id = ?",
433 [ id ]) do |rev|
434 vers.push rev["version"]
435 end
436
437 vers.each do |rev|
438 if prev_revision_hacks[file] && (hpre = prev_revision_hacks[file][rev])
439 puts " faking previous revision of #{file} #{rev} -> #{hpre}"
440 pre = hpre
441 else
442 pre = RCSRevision.previous_of(rev)
443 end
444
445 if pre != "0" && !vers.include?(pre)
446 raise "#{file}: revision #{rev} previous #{pre} not found"
447 end
448 end
449 end
450 files = {}
451
452 # for each changeset with no commitid, store it in the commitids-* file
453 # with a temporary commitid of just its changeset number, do a 'cvs show'
454 # on it to calculate the actual commitid, then overwrite that hash in the
455 # commitids file, and store our new one
456 changesets.each do |cs|
457 cline = []
458 commitid = ""
459 if cs["commitid"].to_s != ""
460 commitid = cs["commitid"]
461 else
462 commitid = sprintf("01-%064d-%07d", cs["csorder"], cs["csorder"])
463 end
464
465 # order by length(revisions.version) to put 1.1 first, then 1.1.1.1, to
466 # match 'cvs import'
467 @db.execute("SELECT
468 files.file, revisions.version, revisions.branch
469 FROM revisions
470 LEFT OUTER JOIN files ON files.id = revisions.file_id
471 WHERE revisions.changeset_id = ?
472 ORDER BY files.file ASC, LENGTH(revisions.version) ASC,
473 revisions.version ASC", [ cs["id"] ]) do |rev|
474 if cline.length == 0
475 cline.push commitid
476 end
477
478 cline.push [ RCSRevision.previous_of(rev["version"]), rev["version"],
479 rev["branch"].to_s, rev["file"].gsub(/,v$/, "") ].join(":")
480 end
481
482 pos = commitids.pos
483 commitids.puts cline.join("\t")
484
485 if cs["commitid"].to_s == ""
486 commitids.fsync
487
488 newcsum = `cvs show #{commitid} | tail -n +2 | cksum -a sha512/256`.strip
489 if $?.exitstatus != 0
490 raise "failed running cvs show #{commitid}"
491 end
492
493 # null
494 if newcsum == "c672b8d1ef56ed28ab87c3622c5114069bdd3ad7b8f9737498d0c01ecef0967a"
495 raise "failed getting new commitid from #{commitid}"
496 end
497
498 newid = sprintf("01-%64s-%07d", newcsum, cs["csorder"])
499
500 @db.execute("UPDATE changesets SET commitid = ? WHERE id = ?",
501 [ newid, cs["id"] ])
502
503 puts " changeset #{cs["csorder"]} -> #{newid}"
504
505 # go back, rewrite just our commitid, then get ready for the next line
506 commitids.seek(pos)
507 commitids.write(newid)
508 commitids.seek(0, IO::SEEK_END)
509 commitids.fsync
510 else
511 puts " changeset #{cs["csorder"]} == #{cs["commitid"]}"
512 end
513 end
514
515 commitids.close
516 end
517
518 def repo_surgery(tmp_dir, cvs_root, tree)
519 puts "updating commitids in rcs files at #{cvs_root} via #{tmp_dir}"
520
521 Dir.chdir("#{tmp_dir}/#{tree}")
522
523 # for each revision we have in the db (picked up from a scan) that has a
524 # different commitid from what we assigned to its changeset, update the
525 # commitid in the rcs file in the repo, and then our revisions records
526 @db.execute("
527 SELECT
528 files.file, changesets.commitid, revisions.version, revisions.id AS revid,
529 revisions.commitid AS revcommitid
530 FROM revisions
531 LEFT OUTER JOIN files ON files.id = revisions.file_id
532 LEFT OUTER JOIN changesets ON revisions.changeset_id = changesets.id
533 WHERE changesets.commitid != IFNULL(revisions.commitid, '')
534 ORDER BY changesets.date ASC, files.file ASC") do |rev|
535 puts [ "", rev["file"], rev["version"], rev["revcommitid"], "->",
536 rev["commitid"] ].join(" ")
537
538 output = nil
539 IO.popen(ca = [ "cvs", "admin", "-C",
540 "#{rev["version"]}:#{rev["commitid"]}",
541 rev["file"].gsub(/,v$/, "") ]) do |admin|
542 output = admin.read
543 end
544
545 if !output.match(/RCS file:/)
546 raise "failed cvs admin command #{ca.inspect}"
547 end
548 end
549
550 # re-read commitids and update file checksums since we probably just
551 # changed many of them, which will then update commitids in revisions table
552 sc.recursively_scan
553 end
554end