# script to retroactively add commitids to past openbsd commits
# (at master, 554 lines, 18 kB)
#
# Copyright (c) 2014, 2016 joshua stein <jcs@jcs.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

# Scans a tree of RCS ",v" files into a database, groups the revisions it
# finds into changesets, calculates a commitid for every changeset and finally
# writes those commitids back into the repository with "cvs admin -C".
#
# NOTE(review): Db, Outputter, RCSFile and RCSRevision are defined elsewhere.
# Db is used as if it were a SQLite wrapper returning rows as string-keyed
# hashes (it is sent "last_insert_rowid()" SQL) -- confirm against its source.
class Scanner
  attr_accessor :outputter, :db, :commitid_hacks, :prev_revision_hacks

  # how long commits by the same author with the same commit message can be
  # from each other and still be grouped in the same changeset
  MAX_GROUP_WINDOW = (60 * 5)

  # dbf  - passed straight to Db.new
  # root - directory holding the RCS files; stored with a trailing slash
  def initialize(dbf, root)
    @db = Db.new dbf
    @root = (root + "/").gsub(/\/\//, "/")
    @outputter = Outputter.new(self)

    # file => { version => previous version }, consulted by
    # recalculate_commitids
    @prev_revision_hacks = {}
    # file => { version => commitid }, consulted by scan
    @commitid_hacks = {}
  end

  # Walks dir (the root given to the constructor when nil), descending into
  # every sub-directory and handing every file whose name ends in ",v" to
  # #scan.
  def recursively_scan(dir = nil)
    dir ||= @root

    puts "recursing into #{dir}"

    Dir.glob((dir + "/*").gsub(/\/\//, "/")).each do |f|
      # Dir.exists? was removed in ruby 3.2
      if Dir.exist?(f)
        recursively_scan(f)
      elsif f.match(/,v$/)
        scan(f)
      end
    end
  end

  # Records one RCS file and all of its revisions and vendor branches in the
  # database.  The file is skipped when its cksum(1) output matches the
  # checksum stored by a previous scan.
  #
  # Raises RuntimeError when cksum prints something unexpected, when the files
  # row cannot be found after inserting it, or when a commitid hack would
  # overwrite a different commitid already present in the RCS file.
  def scan(f)
    cksum = ""
    IO.popen([ "cksum", "-q", f ]) do |c|
      parts = c.read.force_encoding("iso-8859-1").split(" ")
      if parts.length != 2
        raise "invalid output from cksum: #{parts.inspect}"
      end

      cksum = parts[0].encode("utf-8")
    end

    # path relative to the root, with any Attic/ component removed so a file
    # keeps the same name whether or not it currently lives in an Attic
    canfile = f[@root.length, f.length - @root.length].gsub(/(^|\/)Attic\//,
      "/").gsub(/^\/*/, "")

    fid = @db.execute("SELECT id, first_undead_version, cksum FROM files " +
      "WHERE file = ?", [ canfile ]).first
    return if fid && fid["cksum"].to_s == cksum

    puts " scanning file #{canfile}"

    rcs = RCSFile.new(f)

    @db.execute("BEGIN")

    if fid
      if fid["first_undead_version"] != rcs.first_undead_version
        @db.execute("UPDATE files SET first_undead_version = ? WHERE id = ?",
          [ rcs.first_undead_version, fid["id"] ])
      end
    else
      @db.execute("INSERT INTO files (file, first_undead_version) VALUES " +
        "(?, ?)", [ canfile, rcs.first_undead_version ])
      fid = @db.execute("SELECT id FROM files WHERE file = ?",
        [ canfile ]).first
    end
    raise if !fid

    if @commitid_hacks && @commitid_hacks[canfile]
      @commitid_hacks[canfile].each do |v,cid|
        if rcs.revisions[v].commitid &&
        rcs.revisions[v].commitid != cid
          raise "hack for #{canfile}:#{v} commitid of #{cid.inspect} would " +
            "overwrite #{rcs.revisions[v].commitid}"
        end

        puts " faking commitid for revision #{v} -> #{cid}"
        rcs.revisions[v].commitid = cid
      end
    end

    rcs.revisions.each do |r,rev|
      rid = @db.execute("SELECT id, commitid FROM revisions WHERE " +
        "file_id = ? AND version = ?", [ fid["id"], r ]).first

      if rid
        if rid["commitid"] != rev.commitid
          puts " updated #{r} to commitid #{rev.commitid}" +
            (rid["commitid"].to_s == "" ? "" : " from #{rid["commitid"]}")

          @db.execute("UPDATE revisions SET commitid = ? WHERE file_id = ? " +
            "AND version = ?", [ rev.commitid, fid["id"], rev.version ])
        end
      else
        # files added on branches/imports have unhelpful commit messages with
        # the helpful ones on the branch versions, so copy them over while
        # we're here
        if rev.log.to_s == "Initial revision"
          if r == "1.1" && rcs.revisions["1.1.1.1"]
            rev.log = rcs.revisions["1.1.1.1"].log
            puts " revision #{r} using log from 1.1.1.1"
          else
            puts " revision #{r} keeping log #{rev.log.inspect}, no 1.1.1.1"
          end
        elsif m = rev.log.to_s.
        match(/\Afile .+? was initially added on branch ([^\.]+)\.\z/)
          brver = nil
          if br = rcs.symbols[m[1]]
            brver = RCSRevision.first_branch_version_of(br)
            if !rcs.revisions[brver]
              if rcs.revisions[brver + ".1"]
                brver += ".1"
              else
                puts " revision #{r} keeping log #{rev.log.inspect}, no #{brver}"
                brver = nil
              end
            end
          end

          if brver
            rev.log = rcs.revisions[brver].log
            puts " revision #{r} using log from #{brver}"

            # but consider this trunk revision on the branch the file was added
            # on, just so we keep it in the same changeset
            rev.branch = rcs.revisions[brver].branch
          else
            puts " revision #{r} keeping log #{rev.log.inspect}, no #{m[1]}"
          end
        end

        puts " inserted #{r}" +
          (rev.branch ? " (branch #{rev.branch})" : "") +
          ", authored #{rev.date} by #{rev.author}" +
          (rev.commitid ? ", commitid #{rev.commitid}" : "")

        @db.execute("INSERT INTO revisions (file_id, date, version, author, " +
          "commitid, state, log, branch) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
          [ fid["id"], rev.date, rev.version, rev.author, rev.commitid,
          rev.state, rev.log, rev.branch ])
        rid = { "id" => @db.last_insert_row_id }
      end

      # bring the stored vendor branches of this revision in line with the
      # ones in the RCS file: add the missing ones, drop the stale ones
      vbs = @db.execute("SELECT branch FROM vendor_branches WHERE " +
        "revision_id = ?", [ rid["id"] ]).map{|vrow| vrow["branch"] }.flatten

      rev.vendor_branches.each do |vb|
        if !vbs.include?(vb)
          puts " inserting vendor branch #{vb}"
          @db.execute("INSERT INTO vendor_branches (revision_id, branch) " +
            "VALUES (?, ?)", [ rid["id"], vb ])
        end
      end

      vbs.each do |vb|
        if !rev.vendor_branches.include?(vb)
          @db.execute("DELETE FROM vendor_branches WHERE revision_id = ? " +
            "AND branch = ?", [ rid["id"], vb ])
        end
      end
    end

    # only remember the checksum once every revision has been processed
    @db.execute("UPDATE files SET cksum = ? WHERE id = ?",
      [ cksum, fid["id"] ])

    @db.execute("COMMIT")
  end

  # Puts every revision that has no changeset yet into a newly created one.
  #
  # Raises RuntimeError when a changeset id cannot be read back or when any
  # revision is still without a changeset afterwards.
  def group_into_changesets
    puts "grouping into changesets"

    new_sets = []
    last_row = {}
    cur_set = []

    @db.execute("BEGIN")

    # commits by the same author with the same log message within a small
    # timeframe are grouped together
    @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL ORDER " +
    "BY author ASC, branch ASC, commitid ASC, date ASC") do |row|
      if last_row.any? &&
      row["author"] == last_row["author"] &&
      row["branch"] == last_row["branch"] &&
      row["log"] == last_row["log"] &&
      row["commitid"] == last_row["commitid"] &&
      row["date"].to_i - last_row["date"].to_i <= MAX_GROUP_WINDOW
        # same commit as the previous row
        cur_set.push row["id"].to_i
      elsif !last_row.any?
        # very first row
        cur_set.push row["id"].to_i
      else
        # a different commit: close the current set and start a new one
        if cur_set.any?
          new_sets.push cur_set
          cur_set = []
        end
        cur_set.push row["id"].to_i
      end

      last_row = row
    end

    if cur_set.any?
      new_sets.push cur_set
    end

    new_sets.each do |s|
      puts " new set with revision ids #{s.inspect}"
      @db.execute("INSERT INTO changesets (id) VALUES (NULL)")
      id = @db.execute("SELECT last_insert_rowid() AS id").first["id"]
      raise if !id

      # avoid an exception caused by passing too many variables
      s.each_slice(100) do |chunk|
        @db.execute("UPDATE revisions SET changeset_id = ? WHERE id IN (" +
          chunk.map{|a| "?" }.join(",") + ")", [ id ] + chunk)
      end
    end

    if @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL").any?
      raise "still have revisions with empty changesets"
    end

    @db.execute("COMMIT")
  end

  # Attaches every revision that already carries a commitid but has no
  # changeset to the changeset with that commitid, creating the changeset when
  # none exists.
  #
  # Raises RuntimeError when the changeset cannot be found after inserting it.
  def stray_commitids_to_changesets
    @db.execute("BEGIN")

    puts "finding stray commitids"

    stray_commitids = @db.execute("SELECT DISTINCT author, commitid FROM " +
      "revisions WHERE commitid IS NOT NULL AND changeset_id IS NULL")
    stray_commitids.each do |row|
      csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
        [ row["commitid"] ]).first
      if !csid
        @db.execute("INSERT INTO changesets (author, commitid) VALUES (?, ?)",
          [ row["author"], row["commitid"] ])
        csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
          [ row["commitid"] ]).first
      end
      raise if !csid

      puts " commitid #{row["commitid"]} -> changeset #{csid["id"]}"

      @db.execute("UPDATE revisions SET changeset_id = ? WHERE commitid = ?",
        [ csid["id"], row["commitid"] ])
    end

    @db.execute("COMMIT")
  end

  # Fills in date, log, author and branch of every changeset without a date
  # from its revisions, then gives every changeset without a csorder the next
  # free order number (by date, then author) and clears its commitid.
  #
  # Raises RuntimeError when the revisions of one changeset disagree on log,
  # author or branch, or when a changeset has no revisions to take a date from.
  def fill_in_changeset_data
    puts "assigning dates to changesets"

    @db.execute("BEGIN")

    cses = {}
    @db.execute("SELECT id, commitid FROM changesets WHERE date IS NULL") do |c|
      cses[c["id"]] = c["commitid"]
    end

    # create canonical dates for each changeset, so we can pull them back out
    # in order
    cses.each_key do |csid|
      date = nil
      log = nil
      author = nil
      branch = nil

      @db.execute("SELECT * FROM revisions WHERE changeset_id = ? ORDER BY " +
      "date ASC", [ csid ]) do |rev|
        # rows arrive oldest first, so the first date seen is the one kept
        if !date
          date = rev["date"]
        end

        if log && rev["log"] != log
          raise "logs different between revs of #{csid}"
        else
          log = rev["log"]
        end

        if author && rev["author"] != author
          raise "authors different between revs of #{csid}"
        else
          author = rev["author"]
        end

        if branch && rev["branch"] != branch
          raise "branches different between revs of #{csid}"
        else
          branch = rev["branch"]
        end
      end

      if !date
        raise "no date for changeset #{csid}"
      end

      @db.execute("UPDATE changesets SET date = ?, log = ?, author = ?, " +
        "branch = ? WHERE id = ?", [ date, log, author, branch, csid ])
    end

    @db.execute("COMMIT")

    puts "assigning changeset order"

    cses = []
    @db.execute("SELECT id FROM changesets WHERE csorder IS NULL ORDER BY " +
    "date, author") do |c|
      cses.push c["id"]
    end

    # nil.to_i is 0, so numbering starts at 1 when nothing is ordered yet
    highestcs = @db.execute("SELECT MAX(csorder) AS lastcs FROM changesets " +
      "WHERE csorder IS NOT NULL").first["lastcs"].to_i

    @db.execute("BEGIN")
    cses.each do |cs|
      highestcs += 1
      @db.execute("UPDATE changesets SET csorder = ?, commitid = NULL WHERE " +
        "id = ?", [ highestcs, cs ])
    end
    @db.execute("COMMIT")
  end

  # Prepares a working copy of tree under tmp_dir and leaves the process's
  # current directory at "#{tmp_dir}/#{tree}".
  #
  # Raises RuntimeError when any cvs invocation returns non-zero.
  def stage_tmp_cvs(tmp_dir, cvs_root, tree)
    # for a deleted file to be operated by with cvs admin, it must be
    # present in the CVS/Entries files, so check out all files at rev 1.1 so we
    # know they will not be deleted. otherwise cvs admin will fail silently
    #
    # (File.exists? was removed in ruby 3.2)
    if File.exist?("#{tmp_dir}/#{tree}/CVS/Entries")
      puts "updating #{tmp_dir}#{tree} from #{cvs_root}"
      Dir.chdir("#{tmp_dir}/#{tree}")
      system("cvs", "-Q", "-d", cvs_root, "update", "-PAd", "-r1.1") ||
        raise("cvs update returned non-zero")
    else
      puts "checking out #{cvs_root}#{tree} to #{tmp_dir}"
      Dir.chdir(tmp_dir)
      system("cvs", "-Q", "-d", cvs_root, "co", "-r1.1", tree) ||
        raise("cvs checkout returned non-zero")
    end

    Dir.chdir(tmp_dir)

    # but if any files were added on a branch or somehow have a weird history,
    # their 1.1 revision will be dead so check out any non-dead revision of
    # those files
    dead11s = {}
    @db.execute("SELECT
      file, first_undead_version
      FROM files
      WHERE first_undead_version NOT LIKE '1.1' AND
      id IN (SELECT file_id FROM revisions WHERE commitid IS NULL)") do |rev|
      dead11s[rev["file"]] = rev["first_undead_version"]
    end

    dead11s.each do |file,rev|
      confile = file.gsub(/,v$/, "")

      puts " checking out non-dead revision #{rev} of #{confile}"

      system("cvs", "-Q", "-d", cvs_root, "co", "-r#{rev}",
        "#{tree}/#{confile}") ||
        raise("cvs co -r#{rev} #{confile} failed")
    end

    Dir.chdir("#{tmp_dir}/#{tree}")
  end

  # Writes "#{cvs_root}/CVSROOT/commitids-#{tree}" with one line per changeset
  # and stores a newly calculated commitid for every changeset that has none.
  #
  # Raises RuntimeError when an existing commitid_genesis file disagrees with
  # genesis, when the previous revision calculated for any revision is not in
  # the database, or when "cvs show" fails or yields the checksum of no input.
  def recalculate_commitids(tmp_dir, cvs_root, tree, genesis)
    Dir.chdir(tmp_dir + "/#{tree}")

    puts "recalculating new commitids from genesis #{genesis}"

    gfn = "#{cvs_root}/CVSROOT/commitid_genesis"
    if File.exist?(gfn) && File.read(gfn).strip != genesis
      raise "genesis in #{gfn} is not #{genesis.inspect}"
    else
      File.write(gfn, genesis + "\n")
    end

    changesets = []
    @db.execute("SELECT id, csorder, commitid FROM changesets
    ORDER BY csorder ASC") do |cs|
      changesets.push cs
    end

    puts " writing commitids-#{tree} (#{changesets.length} " +
      "changeset#{changesets.length == 1 ? "" : "s"})"

    commitids = File.open("#{cvs_root}/CVSROOT/commitids-#{tree}", "w+")

    begin
      # every changeset needs to know the revisions of its files from the
      # previous change, taking into account branches. we can easily calculate
      # this, but we should make sure that calculated revision actually exists
      files = {}
      @db.execute("SELECT id, file FROM files") do |row|
        files[row["id"]] = row["file"]
      end
      files.each do |id,file|
        vers = []

        @db.execute("SELECT version FROM revisions WHERE file_id = ?",
        [ id ]) do |rev|
          vers.push rev["version"]
        end

        vers.each do |rev|
          if prev_revision_hacks[file] && (hpre = prev_revision_hacks[file][rev])
            puts " faking previous revision of #{file} #{rev} -> #{hpre}"
            pre = hpre
          else
            pre = RCSRevision.previous_of(rev)
          end

          if pre != "0" && !vers.include?(pre)
            raise "#{file}: revision #{rev} previous #{pre} not found"
          end
        end
      end
      # no longer needed; drop the reference
      files = {}

      # for each changeset with no commitid, store it in the commitids-* file
      # with a temporary commitid of just its changeset number, do a 'cvs show'
      # on it to calculate the actual commitid, then overwrite that hash in the
      # commitids file, and store our new one
      changesets.each do |cs|
        cline = []
        commitid = ""
        if cs["commitid"].to_s != ""
          commitid = cs["commitid"]
        else
          commitid = sprintf("01-%064d-%07d", cs["csorder"], cs["csorder"])
        end

        # order by length(revisions.version) to put 1.1 first, then 1.1.1.1, to
        # match 'cvs import'
        @db.execute("SELECT
          files.file, revisions.version, revisions.branch
          FROM revisions
          LEFT OUTER JOIN files ON files.id = revisions.file_id
          WHERE revisions.changeset_id = ?
          ORDER BY files.file ASC, LENGTH(revisions.version) ASC,
          revisions.version ASC", [ cs["id"] ]) do |rev|
          if cline.length == 0
            cline.push commitid
          end

          cline.push [ RCSRevision.previous_of(rev["version"]), rev["version"],
            rev["branch"].to_s, rev["file"].gsub(/,v$/, "") ].join(":")
        end

        # remember where this line starts so the temporary commitid can be
        # overwritten in place below
        pos = commitids.pos
        commitids.puts cline.join("\t")

        if cs["commitid"].to_s == ""
          commitids.fsync

          newcsum = `cvs show #{commitid} | tail -n +2 | cksum -a sha512/256`.strip
          if $?.exitstatus != 0
            raise "failed running cvs show #{commitid}"
          end

          # null
          if newcsum == "c672b8d1ef56ed28ab87c3622c5114069bdd3ad7b8f9737498d0c01ecef0967a"
            raise "failed getting new commitid from #{commitid}"
          end

          newid = sprintf("01-%64s-%07d", newcsum, cs["csorder"])

          @db.execute("UPDATE changesets SET commitid = ? WHERE id = ?",
            [ newid, cs["id"] ])

          puts " changeset #{cs["csorder"]} -> #{newid}"

          # go back, rewrite just our commitid, then get ready for the next line
          commitids.seek(pos)
          commitids.write(newid)
          commitids.seek(0, IO::SEEK_END)
          commitids.fsync
        else
          puts " changeset #{cs["csorder"]} == #{cs["commitid"]}"
        end
      end
    ensure
      # close the file even when one of the checks above raises
      commitids.close
    end
  end

  # Runs "cvs admin -C" from "#{tmp_dir}/#{tree}" for every revision whose
  # commitid differs from the one of its changeset, then rescans the tree.
  #
  # Raises RuntimeError when the output of cvs admin lacks "RCS file:".
  def repo_surgery(tmp_dir, cvs_root, tree)
    puts "updating commitids in rcs files at #{cvs_root} via #{tmp_dir}"

    Dir.chdir("#{tmp_dir}/#{tree}")

    # for each revision we have in the db (picked up from a scan) that has a
    # different commitid from what we assigned to its changeset, update the
    # commitid in the rcs file in the repo, and then our revisions records
    @db.execute("
    SELECT
    files.file, changesets.commitid, revisions.version, revisions.id AS revid,
    revisions.commitid AS revcommitid
    FROM revisions
    LEFT OUTER JOIN files ON files.id = revisions.file_id
    LEFT OUTER JOIN changesets ON revisions.changeset_id = changesets.id
    WHERE changesets.commitid != IFNULL(revisions.commitid, '')
    ORDER BY changesets.date ASC, files.file ASC") do |rev|
      puts [ "", rev["file"], rev["version"], rev["revcommitid"], "->",
        rev["commitid"] ].join(" ")

      output = nil
      IO.popen(ca = [ "cvs", "admin", "-C",
      "#{rev["version"]}:#{rev["commitid"]}",
      rev["file"].gsub(/,v$/, "") ]) do |admin|
        output = admin.read
      end

      if !output.match(/RCS file:/)
        raise "failed cvs admin command #{ca.inspect}"
      end
    end

    # re-read commitids and update file checksums since we probably just
    # changed many of them, which will then update commitids in revisions table
    #
    # (this used to be "sc.recursively_scan", but no "sc" exists in here)
    recursively_scan
  end
end