changeset 890:78db88de9622

Partial metadata rebuilding. For highly active subversion repositories, it can be excruciatingly slow to pull updates one at a time from subversion. One way around this is to set up another mercurial repo that pulls new commits from svn periodically (say every 5 minutes). When you want to update your repository, you can pull commits from this mercurial repository via native mercurial protocols, which will be much faster than pulling directly from svn. Unfortunately, your metadata will be out of date after doing so. Highly active repositories also tend to be very large, which means that it takes a long time to rebuild your metadata from scratch. To address this, this change adds support for doing a partial rebuild of the metadata by processing only revisions that have been added to the repository after the last revision we processed. With the rev map 1k revisions (~2 days) behind tip, updatemeta is dramatically faster than rebuildmeta: $ hg --time svn updatemeta Time: real 0.570 secs (user 0.480+0.000 sys 0.060+0.000) $ hg --time svn rebuildmeta Time: real 129.160 secs (user 128.570+0.000 sys 0.320+0.000)
author David Schleimer <dschleimer@fb.com>
date Sat, 12 May 2012 07:28:23 -0700
parents 7a98fbadcae9
children 83cc6e9e8425
files hgsubversion/svncommands.py tests/test_rebuildmeta.py tests/test_util.py
diffstat 3 files changed, 97 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/hgsubversion/svncommands.py
+++ b/hgsubversion/svncommands.py
@@ -90,11 +90,26 @@ def verify(ui, repo, args=None, **opts):
 
     return result
 
+def updatemeta(ui, repo, args, **opts):
+    """Do a partial rebuild of the subversion metadata.
+
+    Assumes that the metadata that currently exists is valid, but that
+    some is missing, e.g. because you have pulled some revisions via a
+    native mercurial method.
+
+    """
+
+    return _buildmeta(ui, repo, args, partial=True)
+
 
 def rebuildmeta(ui, repo, args, **opts):
     """rebuild hgsubversion metadata using values stored in revisions
     """
 
+    return _buildmeta(ui, repo, args, partial=False)
+
+def _buildmeta(ui, repo, args, partial=False):
+
     if repo is None:
         raise error.RepoError("There is no Mercurial repository"
                               " here (.hg not found)")
@@ -113,14 +128,32 @@ def rebuildmeta(ui, repo, args, **opts):
     if not os.path.exists(svnmetadir):
         os.makedirs(svnmetadir)
 
+    youngest = 0
+    startrev = 0
+    sofar = []
+    branchinfo = {}
+    if partial:
+        try:
+            youngestpath = os.path.join(svnmetadir, 'lastpulled')
+            youngest = int(util.load_string(youngestpath).strip())
+            sofar = list(maps.RevMap.readmapfile(repo))
+            lasthash = sofar[-1].split(' ', 2)[1]
+            startrev = repo[lasthash].rev() + 1
+            branchinfo = pickle.load(open(os.path.join(svnmetadir,
+                                                       'branch_info')))
+        except IOError, err:
+            if err.errno != errno.ENOENT:
+                raise
+            ui.status('missing some metadata -- doing a full rebuild')
+
+
     lastpulled = open(os.path.join(svnmetadir, 'lastpulled'), 'wb')
     revmap = open(os.path.join(svnmetadir, 'rev_map'), 'w')
     revmap.write('1\n')
+    revmap.writelines(sofar)
     last_rev = -1
-    branchinfo = {}
-    noderevnums = {}
     tagfile = os.path.join(svnmetadir, 'tagmap')
-    if os.path.exists(maps.Tags.filepath(repo)):
+    if not partial and os.path.exists(maps.Tags.filepath(repo)) :
         os.unlink(maps.Tags.filepath(repo))
     tags = maps.Tags(repo)
 
@@ -129,7 +162,7 @@ def rebuildmeta(ui, repo, args, **opts):
     skipped = set()
     closed = set()
 
-    numrevs = len(repo)
+    numrevs = len(repo) - startrev
 
     subdirfile = open(os.path.join(svnmetadir, 'subdir'), 'w')
     subdirfile.write(subdir.strip('/'))
@@ -139,9 +172,8 @@ def rebuildmeta(ui, repo, args, **opts):
     # it would make us use O(revisions^2) time, so we perform an extra traversal
     # of the repository instead. During this traversal, we find all converted
     # changesets that close a branch, and store their first parent
-    youngest = 0
-    for rev in repo:
-        util.progress(ui, 'prepare', rev, total=numrevs)
+    for rev in xrange(startrev, len(repo)):
+        util.progress(ui, 'prepare', rev - startrev, total=numrevs)
         ctx = repo[rev]
         convinfo = util.getsvnrev(ctx, None)
         if not convinfo:
@@ -157,13 +189,19 @@ def rebuildmeta(ui, repo, args, **opts):
         parentinfo = util.getsvnrev(parentctx, '@')
 
         if droprev(parentinfo) == droprev(convinfo):
-            closed.add(parentctx.rev())
+            if parentctx.rev() < startrev:
+                parentbranch = parentctx.branch()
+                if parentbranch == 'default':
+                    parentbranch = None
+                branchinfo.pop(parentbranch)
+            else:
+                closed.add(parentctx.rev())
 
     lastpulled.write(str(youngest) + '\n')
     util.progress(ui, 'prepare', None, total=numrevs)
 
-    for rev in repo:
-        util.progress(ui, 'rebuild', rev, total=numrevs)
+    for rev in xrange(startrev, len(repo)):
+        util.progress(ui, 'rebuild', rev-startrev, total=numrevs)
         ctx = repo[rev]
         convinfo = util.getsvnrev(ctx, None)
         if not convinfo:
@@ -241,7 +279,6 @@ def rebuildmeta(ui, repo, args, **opts):
         revmap.write('%s %s %s\n' % (revision, ctx.hex(), commitpath))
 
         revision = int(revision)
-        noderevnums[ctx.node()] = revision
         if revision > last_rev:
             last_rev = revision
 
@@ -279,15 +316,19 @@ def rebuildmeta(ui, repo, args, **opts):
             pass
         elif branch not in branchinfo:
             parent = ctx.parents()[0]
-            if (parent.node() in noderevnums
+            if (parent.node() not in skipped
+                and util.getsvnrev(parent, '').startswith('svn:')
                 and parent.branch() != ctx.branch()):
                 parentbranch = parent.branch()
                 if parentbranch == 'default':
                     parentbranch = None
             else:
                 parentbranch = None
+            # branchinfo is a map from mercurial branch to a
+            # (svn branch, svn parent revision, svn revision) tuple
+            parentrev = util.getsvnrev(parent, '@').split('@')[1] or 0
             branchinfo[branch] = (parentbranch,
-                                  noderevnums.get(parent.node(), 0),
+                                  int(parentrev),
                                   revision)
 
     util.progress(ui, 'rebuild', None, total=numrevs)
@@ -522,6 +563,7 @@ table = {
     'listauthors': listauthors,
     'update': update,
     'help': help_,
+    'updatemeta': updatemeta,
     'rebuildmeta': rebuildmeta,
     'updateexternals': svnexternals.updateexternals,
     'verify': verify,
--- a/tests/test_rebuildmeta.py
+++ b/tests/test_rebuildmeta.py
@@ -51,6 +51,43 @@ def _do_case(self, name, stupid, single)
         # remove the wrapper
         context.changectx.children = origchildren
 
+    self._run_assertions(name, stupid, single, src, dest, u)
+
+    wc3_path = self.wc_path + '_partial'
+    src, dest = test_util.hgclone(u,
+                                  self.wc_path,
+                                  wc3_path,
+                                  update=False,
+                                  rev=[0])
+
+    # insert a wrapper that prevents calling changectx.children()
+    extensions.wrapfunction(context.changectx, 'children', failfn)
+
+    try:
+        svncommands.rebuildmeta(u, dest,
+                                args=[test_util.fileurl(repo_path +
+                                                        subdir), ])
+    finally:
+        # remove the wrapper
+        context.changectx.children = origchildren
+
+    dest.pull(src)
+
+    # insert a wrapper that prevents calling changectx.children()
+    extensions.wrapfunction(context.changectx, 'children', failfn)
+    try:
+        svncommands.updatemeta(u, dest,
+                               args=[test_util.fileurl(repo_path +
+                                                        subdir), ])
+    finally:
+        # remove the wrapper
+        context.changectx.children = origchildren
+
+    self._run_assertions(name, stupid, single, src, dest, u)
+
+
+def _run_assertions(self, name, stupid, single, src, dest, u):
+
     self.assertTrue(os.path.isdir(os.path.join(src.path, 'svn')),
                     'no .hg/svn directory in the source!')
     self.assertTrue(os.path.isdir(os.path.join(src.path, 'svn')),
@@ -68,7 +105,7 @@ def _do_case(self, name, stupid, single)
             self.assertNotEqual(old, new,
                                 'rebuildmeta unexpected match on youngest rev!')
             continue
-        self.assertMultiLineEqual(old, new)
+        self.assertMultiLineEqual(old, new, tf + ' differs')
         self.assertEqual(src.branchtags(), dest.branchtags())
     srcbi = pickle.load(open(os.path.join(src.path, 'svn', 'branch_info')))
     destbi = pickle.load(open(os.path.join(dest.path, 'svn', 'branch_info')))
@@ -102,6 +139,7 @@ def buildmethod(case, name, stupid, sing
 
 
 attrs = {'_do_case': _do_case,
+         '_run_assertions': _run_assertions,
          }
 for case in [f for f in os.listdir(test_util.FIXTURES) if f.endswith('.svndump')]:
     # this fixture results in an empty repository, don't use it
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -199,12 +199,12 @@ def _verify_our_modules():
             'from the wrong path!'
         )
 
-def hgclone(ui, source, dest, update=True):
+def hgclone(ui, source, dest, update=True, rev=None):
     if getattr(hg, 'peer', None):
         # Since 1.9 (d976542986d2)
-        src, dest = hg.clone(ui, {}, source, dest, update=update)
+        src, dest = hg.clone(ui, {}, source, dest, update=update, rev=rev)
     else:
-        src, dest = hg.clone(ui, source, dest, update=update)
+        src, dest = hg.clone(ui, source, dest, update=update, rev=rev)
     return src, dest
 
 def svnls(repo_path, path, rev='HEAD'):