# # Copyright (c) 2014, 2016 joshua stein # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#

# Scans a tree of RCS ",v" files into a database, groups the recorded
# revisions into changesets, computes a commitid for every changeset and
# writes those commitids back into the repository with 'cvs admin'.
#
# Db, Outputter, RCSFile and RCSRevision are defined elsewhere in the
# project.
class Scanner
  attr_accessor :outputter, :db, :commitid_hacks, :prev_revision_hacks

  # how long commits by the same author with the same commit message can be
  # from each other and still be grouped in the same changeset
  MAX_GROUP_WINDOW = (60 * 5)

  # dbf is handed to Db.new; root is the top of the tree of ,v files and is
  # stored with a trailing slash (doubled slashes collapsed).
  def initialize(dbf, root)
    @db = Db.new dbf
    @root = (root + "/").gsub(/\/\//, "/")
    @outputter = Outputter.new(self)
    @prev_revision_hacks = {}
    @commitid_hacks = {}
  end

  # Walk dir (the root when nil), descending into every directory and
  # calling scan on every entry whose name ends in ",v".
  def recursively_scan(dir = nil)
    dir ||= @root

    puts "recursing into #{dir}"

    Dir.glob((dir + "/*").gsub(/\/\//, "/")).each do |f|
      # Dir.exists? was removed in ruby 3.2; Dir.exist? works everywhere
      if Dir.exist?(f)
        recursively_scan(f)
      elsif f.match(/,v$/)
        scan(f)
      end
    end
  end

  # Record the RCS file f, and each of its revisions and vendor branches, in
  # the database.  Files whose cksum matches the one already stored are
  # skipped.  Raises RuntimeError on unexpected cksum output or when a
  # commitid hack would overwrite a different existing commitid.
  def scan(f)
    cksum = ""
    IO.popen([ "cksum", "-q", f ]) do |c|
      parts = c.read.force_encoding("iso-8859-1").split(" ")
      if parts.length != 2
        raise "invalid output from cksum: #{parts.inspect}"
      end

      cksum = parts[0].encode("utf-8")
    end

    # path relative to the root, with any Attic/ component removed
    canfile = f[@root.length, f.length - @root.length].
      gsub(/(^|\/)Attic\//, "/").gsub(/^\/*/, "")

    fid = @db.execute("SELECT id, first_undead_version, cksum FROM files " +
      "WHERE file = ?", [ canfile ]).first
    return if fid && fid["cksum"].to_s == cksum

    puts " scanning file #{canfile}"

    rcs = RCSFile.new(f)

    @db.execute("BEGIN")

    if fid
      if fid["first_undead_version"] != rcs.first_undead_version
        @db.execute("UPDATE files SET first_undead_version = ? WHERE id = ?",
          [ rcs.first_undead_version, fid["id"] ])
      end
    else
      @db.execute("INSERT INTO files (file, first_undead_version) VALUES " +
        "(?, ?)", [ canfile, rcs.first_undead_version ])
      fid = @db.execute("SELECT id FROM files WHERE file = ?",
        [ canfile ]).first
    end
    raise "no files row for #{canfile} after insert" if !fid

    if @commitid_hacks && @commitid_hacks[canfile]
      @commitid_hacks[canfile].each do |v,cid|
        if rcs.revisions[v].commitid && rcs.revisions[v].commitid != cid
          raise "hack for #{canfile}:#{v} commitid of #{cid.inspect} would " +
            "overwrite #{rcs.revisions[v].commitid}"
        end

        puts " faking commitid for revision #{v} -> #{cid}"
        rcs.revisions[v].commitid = cid
      end
    end

    rcs.revisions.each do |r,rev|
      rid = @db.execute("SELECT id, commitid FROM revisions WHERE " +
        "file_id = ? AND version = ?", [ fid["id"], r ]).first

      if rid
        if rid["commitid"] != rev.commitid
          puts " updated #{r} to commitid #{rev.commitid}" +
            (rid["commitid"].to_s == "" ? "" : " from #{rid["commitid"]}")

          @db.execute("UPDATE revisions SET commitid = ? WHERE file_id = ? " +
            "AND version = ?", [ rev.commitid, fid["id"], rev.version ])
        end
      else
        # files added on branches/imports have unhelpful commit messages with
        # the helpful ones on the branch versions, so copy them over while
        # we're here
        if rev.log.to_s == "Initial revision"
          if r == "1.1" && rcs.revisions["1.1.1.1"]
            rev.log = rcs.revisions["1.1.1.1"].log
            puts " revision #{r} using log from 1.1.1.1"
          else
            puts " revision #{r} keeping log #{rev.log.inspect}, no 1.1.1.1"
          end
        elsif (m = rev.log.to_s.
        match(/\Afile .+? was initially added on branch ([^\.]+)\.\z/))
          brver = nil
          if (br = rcs.symbols[m[1]])
            brver = RCSRevision.first_branch_version_of(br)

            if !rcs.revisions[brver]
              if rcs.revisions[brver + ".1"]
                brver += ".1"
              else
                puts " revision #{r} keeping log #{rev.log.inspect}, no #{brver}"
                brver = nil
              end
            end
          end

          if brver
            rev.log = rcs.revisions[brver].log
            puts " revision #{r} using log from #{brver}"

            # but consider this trunk revision on the branch the file was added
            # on, just so we keep it in the same changeset
            rev.branch = rcs.revisions[brver].branch
          else
            puts " revision #{r} keeping log #{rev.log.inspect}, no #{m[1]}"
          end
        end

        puts " inserted #{r}" +
          (rev.branch ? " (branch #{rev.branch})" : "") +
          ", authored #{rev.date} by #{rev.author}" +
          (rev.commitid ? ", commitid #{rev.commitid}" : "")

        @db.execute("INSERT INTO revisions (file_id, date, version, author, " +
          "commitid, state, log, branch) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
          [ fid["id"], rev.date, rev.version, rev.author, rev.commitid,
          rev.state, rev.log, rev.branch ])

        rid = { "id" => @db.last_insert_row_id }
      end

      # make the stored vendor branches of this revision match the rcs file:
      # add the missing ones, then drop the ones no longer present
      vbs = @db.execute("SELECT branch FROM vendor_branches WHERE " +
        "revision_id = ?", [ rid["id"] ]).map{|row| row["branch"] }.flatten

      rev.vendor_branches.each do |vb|
        if !vbs.include?(vb)
          puts " inserting vendor branch #{vb}"
          @db.execute("INSERT INTO vendor_branches (revision_id, branch) " +
            "VALUES (?, ?)", [ rid["id"], vb ])
        end
      end

      vbs.each do |vb|
        if !rev.vendor_branches.include?(vb)
          @db.execute("DELETE FROM vendor_branches WHERE revision_id = ? " +
            "AND branch = ?", [ rid["id"], vb ])
        end
      end
    end

    # store the checksum last, so it is only recorded once every revision
    # has been handled
    @db.execute("UPDATE files SET cksum = ? WHERE id = ?",
      [ cksum, fid["id"] ])

    @db.execute("COMMIT")
  end

  # Create a changeset for each run of not-yet-assigned revisions sharing
  # author, branch, log and commitid whose dates are each within
  # MAX_GROUP_WINDOW of the previous row.  Raises RuntimeError if any
  # revision is still unassigned afterwards.
  def group_into_changesets
    puts "grouping into changesets"

    new_sets = []
    last_row = {}
    cur_set = []

    @db.execute("BEGIN")

    # commits by the same author with the same log message within a small
    # timeframe are grouped together
    @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL ORDER " +
    "BY author ASC, branch ASC, commitid ASC, date ASC") do |row|
      if last_row.any? &&
      row["author"] == last_row["author"] &&
      row["branch"] == last_row["branch"] &&
      row["log"] == last_row["log"] &&
      row["commitid"] == last_row["commitid"] &&
      row["date"].to_i - last_row["date"].to_i <= MAX_GROUP_WINDOW
        cur_set.push row["id"].to_i
      elsif !last_row.any?
        # very first row starts the first set
        cur_set.push row["id"].to_i
      else
        if cur_set.any?
          new_sets.push cur_set
          cur_set = []
        end
        cur_set.push row["id"].to_i
      end

      last_row = row
    end

    if cur_set.any?
      new_sets.push cur_set
    end

    new_sets.each do |s|
      puts " new set with revision ids #{s.inspect}"
      @db.execute("INSERT INTO changesets (id) VALUES (NULL)")
      id = @db.execute("SELECT last_insert_rowid() AS id").first["id"]
      raise "no id for newly inserted changeset" if !id

      # avoid an exception caused by passing too many variables
      s.each_slice(100) do |chunk|
        @db.execute("UPDATE revisions SET changeset_id = ? WHERE id IN (" +
          chunk.map{|a| "?" }.join(",") + ")", [ id ] + chunk)
      end
    end

    if @db.execute("SELECT * FROM revisions WHERE changeset_id IS NULL").any?
      raise "still have revisions with empty changesets"
    end

    @db.execute("COMMIT")
  end

  # Attach every revision that has a commitid but no changeset to the
  # changeset carrying that commitid, creating the changeset if needed.
  def stray_commitids_to_changesets
    @db.execute("BEGIN")

    puts "finding stray commitids"

    stray_commitids = @db.execute("SELECT DISTINCT author, commitid FROM " +
      "revisions WHERE commitid IS NOT NULL AND changeset_id IS NULL")
    stray_commitids.each do |row|
      csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
        [ row["commitid"] ]).first
      if !csid
        @db.execute("INSERT INTO changesets (author, commitid) VALUES (?, ?)",
          [ row["author"], row["commitid"] ])
        csid = @db.execute("SELECT id FROM changesets WHERE commitid = ?",
          [ row["commitid"] ]).first
      end
      raise "no changeset for commitid #{row["commitid"]}" if !csid

      puts " commitid #{row["commitid"]} -> changeset #{csid["id"]}"

      @db.execute("UPDATE revisions SET changeset_id = ? WHERE commitid = ?",
        [ csid["id"], row["commitid"] ])
    end

    @db.execute("COMMIT")
  end

  # Fill in date, log, author and branch of every changeset that has no date
  # yet, then number the changesets without a csorder by date and author.
  # Raises RuntimeError when the revisions of one changeset disagree on log,
  # author or branch, or when a changeset has no revisions.
  def fill_in_changeset_data
    puts "assigning dates to changesets"

    @db.execute("BEGIN")

    cses = {}
    @db.execute("SELECT id, commitid FROM changesets WHERE date IS NULL") do |c|
      cses[c["id"]] = c["commitid"]
    end

    # create canonical dates for each changeset, so we can pull them back out
    # in order
    cses.each_key do |csid|
      date = nil
      log = nil
      author = nil
      branch = nil

      @db.execute("SELECT * FROM revisions WHERE changeset_id = ? ORDER BY " +
      "date ASC", [ csid ]) do |rev|
        # rows arrive oldest first, so the first date seen is the one kept
        if !date
          date = rev["date"]
        end

        if log && rev["log"] != log
          raise "logs different between revs of #{csid}"
        else
          log = rev["log"]
        end

        if author && rev["author"] != author
          raise "authors different between revs of #{csid}"
        else
          author = rev["author"]
        end

        if branch && rev["branch"] != branch
          raise "branches different between revs of #{csid}"
        else
          branch = rev["branch"]
        end
      end

      if !date
        raise "no date for changeset #{csid}"
      end

      @db.execute("UPDATE changesets SET date = ?, log = ?, author = ?, " +
        "branch = ? WHERE id = ?", [ date, log, author, branch, csid ])
    end

    @db.execute("COMMIT")

    puts "assigning changeset order"

    cses = []
    @db.execute("SELECT id FROM changesets WHERE csorder IS NULL ORDER BY " +
    "date, author") do |c|
      cses.push c["id"]
    end

    # continue numbering after the highest csorder already handed out
    highestcs = @db.execute("SELECT MAX(csorder) AS lastcs FROM changesets " +
      "WHERE csorder IS NOT NULL").first["lastcs"].to_i

    @db.execute("BEGIN")

    cses.each do |cs|
      highestcs += 1
      @db.execute("UPDATE changesets SET csorder = ?, commitid = NULL WHERE " +
        "id = ?", [ highestcs, cs ])
    end

    @db.execute("COMMIT")
  end

  # Check out (or update) tree from cvs_root under tmp_dir at revision 1.1,
  # then check out the first undead revision of files whose first undead
  # version is not 1.1.  Leaves the working directory at tmp_dir/tree.
  # Raises RuntimeError when a cvs command fails.
  def stage_tmp_cvs(tmp_dir, cvs_root, tree)
    # for a deleted file to be operated by with cvs admin, it must be
    # present in the CVS/Entries files, so check out all files at rev 1.1 so we
    # know they will not be deleted. otherwise cvs admin will fail silently
    if File.exist?("#{tmp_dir}/#{tree}/CVS/Entries")
      puts "updating #{tmp_dir}#{tree} from #{cvs_root}"
      Dir.chdir("#{tmp_dir}/#{tree}")
      system("cvs", "-Q", "-d", cvs_root, "update", "-PAd", "-r1.1") ||
        raise("cvs update returned non-zero")
    else
      puts "checking out #{cvs_root}#{tree} to #{tmp_dir}"
      Dir.chdir(tmp_dir)
      system("cvs", "-Q", "-d", cvs_root, "co", "-r1.1", tree) ||
        raise("cvs checkout returned non-zero")
    end

    Dir.chdir(tmp_dir)

    # but if any files were added on a branch or somehow have a weird history,
    # their 1.1 revision will be dead so check out any non-dead revision of
    # those files
    dead11s = {}
    @db.execute("SELECT file, first_undead_version FROM files WHERE " +
    "first_undead_version NOT LIKE '1.1' AND id IN (SELECT file_id FROM " +
    "revisions WHERE commitid IS NULL)") do |rev|
      dead11s[rev["file"]] = rev["first_undead_version"]
    end

    dead11s.each do |file,rev|
      confile = file.gsub(/,v$/, "")

      puts " checking out non-dead revision #{rev} of #{confile}"

      system("cvs", "-Q", "-d", cvs_root, "co", "-r#{rev}",
        "#{tree}/#{confile}") ||
        raise("cvs co -r#{rev} #{confile} failed")
    end

    Dir.chdir("#{tmp_dir}/#{tree}")
  end

  # Write cvs_root/CVSROOT/commitids-<tree> with one line per changeset and
  # compute a commitid for each changeset that lacks one, storing it in the
  # changesets table.  Raises RuntimeError when the stored genesis differs,
  # when a revision's previous revision is unknown, or when 'cvs show'
  # fails.
  def recalculate_commitids(tmp_dir, cvs_root, tree, genesis)
    Dir.chdir(tmp_dir + "/#{tree}")

    puts "recalculating new commitids from genesis #{genesis}"

    gfn = "#{cvs_root}/CVSROOT/commitid_genesis"
    if File.exist?(gfn) && File.read(gfn).strip != genesis
      raise "genesis in #{gfn} is not #{genesis.inspect}"
    else
      File.write(gfn, genesis + "\n")
    end

    changesets = []
    @db.execute("SELECT id, csorder, commitid FROM changesets ORDER BY " +
    "csorder ASC") do |cs|
      changesets.push cs
    end

    puts " writing commitids-#{tree} (#{changesets.length} " +
      "changeset#{changesets.length == 1 ? "" : "s"})"

    # every changeset needs to know the revisions of its files from the
    # previous change, taking into account branches. we can easily calculate
    # this, but we should make sure that calculated revision actually exists
    files = {}
    @db.execute("SELECT id, file FROM files") do |row|
      files[row["id"]] = row["file"]
    end

    files.each do |id,file|
      vers = []
      @db.execute("SELECT version FROM revisions WHERE file_id = ?",
      [ id ]) do |rev|
        vers.push rev["version"]
      end

      vers.each do |rev|
        if prev_revision_hacks[file] && (hpre = prev_revision_hacks[file][rev])
          puts " faking previous revision of #{file} #{rev} -> #{hpre}"
          pre = hpre
        else
          pre = RCSRevision.previous_of(rev)
        end

        if pre != "0" && !vers.include?(pre)
          raise "#{file}: revision #{rev} previous #{pre} not found"
        end
      end
    end

    # block form so the file is closed even when one of the raises below
    # fires
    File.open("#{cvs_root}/CVSROOT/commitids-#{tree}", "w+") do |commitids|
      # for each changeset with no commitid, store it in the commitids-* file
      # with a temporary commitid of just its changeset number, do a 'cvs show'
      # on it to calculate the actual commitid, then overwrite that hash in the
      # commitids file, and store our new one
      changesets.each do |cs|
        cline = []

        if cs["commitid"].to_s != ""
          commitid = cs["commitid"]
        else
          commitid = sprintf("01-%064d-%07d", cs["csorder"], cs["csorder"])
        end

        # order by length(revisions.version) to put 1.1 first, then 1.1.1.1, to
        # match 'cvs import'
        @db.execute("SELECT files.file, revisions.version, " +
        "revisions.branch FROM revisions LEFT OUTER JOIN files ON " +
        "files.id = revisions.file_id WHERE revisions.changeset_id = ? " +
        "ORDER BY files.file ASC, LENGTH(revisions.version) ASC, " +
        "revisions.version ASC", [ cs["id"] ]) do |rev|
          if cline.length == 0
            cline.push commitid
          end

          cline.push [ RCSRevision.previous_of(rev["version"]), rev["version"],
            rev["branch"].to_s, rev["file"].gsub(/,v$/, "") ].join(":")
        end

        # remember where this line starts so the commitid can be rewritten
        pos = commitids.pos
        commitids.puts cline.join("\t")

        if cs["commitid"].to_s == ""
          # flush to disk before 'cvs show' is run against this commitid
          commitids.fsync

          # NOTE: $? only reflects the last command of the pipeline (cksum),
          # which is why the empty-input digest is also checked below
          newcsum = `cvs show #{commitid} | tail -n +2 | cksum -a sha512/256`.strip
          if $?.exitstatus != 0
            raise "failed running cvs show #{commitid}"
          end

          # null
          if newcsum == "c672b8d1ef56ed28ab87c3622c5114069bdd3ad7b8f9737498d0c01ecef0967a"
            raise "failed getting new commitid from #{commitid}"
          end

          newid = sprintf("01-%64s-%07d", newcsum, cs["csorder"])

          @db.execute("UPDATE changesets SET commitid = ? WHERE id = ?",
            [ newid, cs["id"] ])

          puts " changeset #{cs["csorder"]} -> #{newid}"

          # go back, rewrite just our commitid, then get ready for the next line
          commitids.seek(pos)
          commitids.write(newid)
          commitids.seek(0, IO::SEEK_END)
          commitids.fsync
        else
          puts " changeset #{cs["csorder"]} == #{cs["commitid"]}"
        end
      end
    end
  end

  # Run 'cvs admin -C' from tmp_dir/tree for every revision whose commitid
  # differs from its changeset's commitid, then rescan the tree.  Raises
  # RuntimeError when the cvs admin output lacks "RCS file:".
  def repo_surgery(tmp_dir, cvs_root, tree)
    puts "updating commitids in rcs files at #{cvs_root} via #{tmp_dir}"

    Dir.chdir("#{tmp_dir}/#{tree}")

    # for each revision we have in the db (picked up from a scan) that has a
    # different commitid from what we assigned to its changeset, update the
    # commitid in the rcs file in the repo, and then our revisions records
    @db.execute(" SELECT files.file, changesets.commitid, " +
    "revisions.version, revisions.id AS revid, " +
    "revisions.commitid AS revcommitid FROM revisions " +
    "LEFT OUTER JOIN files ON files.id = revisions.file_id " +
    "LEFT OUTER JOIN changesets ON revisions.changeset_id = changesets.id " +
    "WHERE changesets.commitid != IFNULL(revisions.commitid, '') " +
    "ORDER BY changesets.date ASC, files.file ASC") do |rev|
      puts [ "", rev["file"], rev["version"], rev["revcommitid"], "->",
        rev["commitid"] ].join(" ")

      ca = [ "cvs", "admin", "-C", "#{rev["version"]}:#{rev["commitid"]}",
        rev["file"].gsub(/,v$/, "") ]

      output = nil
      IO.popen(ca) do |admin|
        output = admin.read
      end

      if !output.match(/RCS file:/)
        raise "failed cvs admin command #{ca.inspect}"
      end
    end

    # re-read commitids and update file checksums since we probably just
    # changed many of them, which will then update commitids in revisions table
    recursively_scan
  end
end