changeset 203:907c160c6289

Refactor branch handling to be much more dynamic (and hopefully robust). This should allow fixing of several outstanding issues with branch handling. Note that this is a *massive* change to one of the oldest parts of hgsubversion, so it might introduce bugs not caught by the testsuite.
author Augie Fackler <durin42@gmail.com>
date Mon, 02 Mar 2009 23:58:38 -0600
parents 125cf3cb7bee
children 496ff87e2532
files fetch_command.py hg_delta_editor.py tests/test_fetch_command.py tests/test_fetch_mappings.py
diffstat 4 files changed, 220 insertions(+), 71 deletions(-) [+]
line wrap: on
line diff
--- a/fetch_command.py
+++ b/fetch_command.py
@@ -69,7 +69,7 @@ def fetch_revisions(ui, svn_url, hg_repo
 
     # start converting revisions
     for r in svn.revisions(start=start):
-        valid = False
+        valid = True
         hg_editor.update_branch_tag_map_for_rev(r)
         for p in r.paths:
             if hg_editor._is_path_valid(p):
@@ -224,11 +224,12 @@ def stupid_diff_branchrev(ui, svn, hg_ed
     callable to retrieve individual file information. Raise BadPatchApply upon
     error.
     """
-    def make_diff_path(b):
-        if b == None:
+    def make_diff_path(branch):
+        if branch == 'trunk' or branch is None:
             return 'trunk'
-        return 'branches/' + b
-
+        elif branch.startswith('../'):
+            return branch[3:]
+        return 'branches/%s' % branch
     parent_rev, br_p = hg_editor.get_parent_svn_branch_and_rev(r.revnum, branch)
     diff_path = make_diff_path(branch)
     try:
@@ -554,8 +555,41 @@ def stupid_fetch_branchrev(svn, hg_edito
 
 def stupid_svn_server_pull_rev(ui, svn, hg_editor, r):
     # this server fails at replay
-    branches = hg_editor.branches_in_paths(r.paths)
+    branches = hg_editor.branches_in_paths(r.paths, r.revnum, svn.checkpath, svn.list_files)
     deleted_branches = {}
+    brpaths = branches.values()
+    bad_branch_paths = {}
+    for br, bp in branches.iteritems():
+        bad_branch_paths[br] = []
+
+        # This next block might be needed, but for now I'm omitting it until it can be
+        # proven necessary.
+        # for bad in brpaths:
+        #     if bad.startswith(bp) and len(bad) > len(bp):
+        #         bad_branch_paths[br].append(bad[len(bp)+1:])
+
+        # We've go a branch that contains other branches. We have to be careful to
+        # get results similar to real replay in this case.
+        for existingbr in hg_editor.branches:
+            bad = hg_editor._remotename(existingbr)
+            if bad.startswith(bp) and len(bad) > len(bp):
+                bad_branch_paths[br].append(bad[len(bp)+1:])
+    for p in r.paths:
+        if hg_editor._is_path_tag(p):
+            continue
+        branch = hg_editor._localname(p)
+        if r.paths[p].action == 'R' and branch in hg_editor.branches:
+            branchedits = sorted(filter(lambda x: x[0][1] == branch and x[0][0] < r.revnum,
+                                        hg_editor.revmap.iteritems()), reverse=True)
+            is_closed = False
+            if len(branchedits) > 0:
+                branchtip = branchedits[0][1]
+                for child in hg_editor.repo[branchtip].children():
+                    if child.branch() == 'closed-branches':
+                        is_closed = True
+                        break
+                if not is_closed:
+                    deleted_branches[branch] = branchtip
     date = r.date.replace('T', ' ').replace('Z', '').split('.')[0]
     date += ' -0000'
     check_deleted_branches = set()
@@ -589,12 +623,15 @@ def stupid_svn_server_pull_rev(ui, svn, 
                         raise IOError()
                     return context.memfilectx(path=path, data=externals.write(),
                                               islink=False, isexec=False, copied=None)
+                for bad in bad_branch_paths[b]:
+                    if path.startswith(bad):
+                        raise IOError()
                 return filectxfn2(repo, memctx, path)
 
         extra = util.build_extra(r.revnum, b, svn.uuid, svn.subdir)
         if '' in files_touched:
             files_touched.remove('')
-        excluded = [f for f in files_touched 
+        excluded = [f for f in files_touched
                     if not hg_editor._is_file_included(f)]
         for f in excluded:
             files_touched.remove(f)
@@ -612,6 +649,9 @@ def stupid_svn_server_pull_rev(ui, svn, 
                                          date,
                                          extra)
             ha = hg_editor.repo.commitctx(current_ctx)
+            branch = extra.get('branch', None)
+            if not branch in hg_editor.branches:
+                hg_editor.branches[branch] = None, 0, r.revnum
             hg_editor.add_to_revmap(r.revnum, b, ha)
             hg_editor._save_metadata()
             util.describe_commit(ui, ha, b)
--- a/hg_delta_editor.py
+++ b/hg_delta_editor.py
@@ -166,41 +166,111 @@ class HgChangeReceiver(delta.Editor):
         pickle_atomic(self.branches, self.branch_info_file, self.meta_data_dir)
         pickle_atomic(self.tags, self.tag_info_file, self.meta_data_dir)
 
-    def branches_in_paths(self, paths):
+    def branches_in_paths(self, paths, revnum, checkpath, listdir):
         '''Given a list of paths, return mapping of all branches touched
         to their branch path.
         '''
         branches = {}
+        paths_need_discovery = []
         for p in paths:
             relpath, branch, branchpath = self._split_branch_path(p)
             if relpath is not None:
                 branches[branch] = branchpath
+            elif paths[p].action == 'D' and not self._is_path_tag(p):
+                ln = self._localname(p)
+                # must check in branches_to_delete as well, because this runs after we
+                # already updated the branch map
+                if ln in self.branches or ln in self.branches_to_delete:
+                    branches[self._localname(p)] = p
+            else:
+                paths_need_discovery.append(p)
+        if paths_need_discovery:
+            paths_need_discovery = [(len(p), p) for p in paths_need_discovery]
+            paths_need_discovery.sort()
+            paths_need_discovery = [p[1] for p in paths_need_discovery]
+            actually_files = []
+            while paths_need_discovery:
+                p = paths_need_discovery.pop(0)
+                path_could_be_file = True
+                # TODO(augie) Figure out if you can use break here in a for loop, quick
+                # testing of that failed earlier.
+                ind = 0
+                while ind < len(paths_need_discovery) and not paths_need_discovery:
+                    if op.startswith(p):
+                        path_could_be_file = False
+                    ind += 1
+                if path_could_be_file:
+                    if checkpath(p, revnum) == 'f':
+                        actually_files.append(p)
+                    # if there's a copyfrom_path and there were files inside that copyfrom,
+                    # we need to detect those branches. It's a little thorny and slow, but
+                    # seems to be the best option.
+                    elif paths[p].copyfrom_path and not p.startswith('tags/'):
+                        paths_need_discovery.extend(['%s/%s' % (p,x[0])
+                                                     for x in listdir(p, revnum)
+                                                     if x[1] == 'f'])
+            if actually_files:
+                filepaths = [p.split('/') for p in actually_files]
+                filepaths = [(len(p), p) for p in filepaths]
+                filepaths.sort()
+                filepaths = [p[1] for p in filepaths]
+                while filepaths:
+                    path = filepaths.pop(0)
+                    parentdir = '/'.join(path[:-1])
+                    filepaths = [p for p in filepaths if not '/'.join(p).startswith(parentdir)]
+                    branchpath = self._normalize_path(parentdir)
+                    branchname = self._localname(branchpath)
+                    branches[branchname] = branchpath
+
         return branches
 
-    def _path_and_branch_for_path(self, path):
-        return self._split_branch_path(path)[:2]
+    def _path_and_branch_for_path(self, path, existing=True):
+        return self._split_branch_path(path, existing=existing)[:2]
+
+    def _localname(self, path):
+        """Compute the local name for a branch located at path.
+        """
+        assert not path.startswith('tags/')
+        if path == 'trunk':
+            return None
+        elif path.startswith('branches/'):
+            return path[len('branches/'):]
+        return  '../%s' % path
+
+    def _remotename(self, branch):
+        if branch == 'default' or branch is None:
+            return 'trunk'
+        elif branch.startswith('../'):
+            return branch[3:]
+        return 'branches/%s' % branch
 
-    def _split_branch_path(self, path):
+    def _split_branch_path(self, path, existing=True):
         """Figure out which branch inside our repo this path represents, and
         also figure out which path inside that branch it is.
 
-        Raises an exception if it can't perform its job.
+        Returns a tuple of (path within branch, local branch name, server-side branch path).
+
+        If existing=True, will return None, None, None if the file isn't on some known
+        branch. If existing=False, then it will guess what the branch would be if it were
+        known.
         """
         path = self._normalize_path(path)
-        if path.startswith('trunk'):
-            p = path[len('trunk'):]
-            if p and p[0] == '/':
-                p = p[1:]
-            return p, None, 'trunk'
-        elif path.startswith('branches/'):
-            p = path[len('branches/'):]
-            br = p.split('/')[0]
-            if br:
-                p = p[len(br)+1:]
-                if p and p[0] == '/':
-                    p = p[1:]
-                return p, br, 'branches/' + br
-        return None, None, None
+        if path.startswith('tags/'):
+            return None, None, None
+        test = ''
+        path_comps = path.split('/')
+        while self._localname(test) not in self.branches and len(path_comps):
+            if not test:
+                test = path_comps.pop(0)
+            else:
+                test += '/%s' % path_comps.pop(0)
+        if self._localname(test) in self.branches:
+            return path[len(test)+1:], self._localname(test), test
+        if existing:
+            return None, None, None
+        path = test.split('/')[-1]
+        test = '/'.join(test.split('/')[:-1])
+        return path, self._localname(test), test
 
     def set_current_rev(self, rev):
         """Set the revision we're currently converting.
@@ -234,7 +304,7 @@ class HgChangeReceiver(delta.Editor):
         if path and path[0] == '/':
             path = path[1:]
         return path
-        
+
     def _is_file_included(self, subpath):
         def checkpathinmap(path, mapping):
             def rpairs(name):
@@ -243,14 +313,14 @@ class HgChangeReceiver(delta.Editor):
                 while e != -1:
                     yield name[:e], name[e+1:]
                     e = name.rfind('/', 0, e)
-            
+
             for pre, suf in rpairs(path):
                 try:
                     return mapping[pre]
                 except KeyError, err:
                     pass
             return None
-        
+
         if len(self.includepaths) and len(subpath):
             inc = checkpathinmap(subpath, self.includepaths)
         else:
@@ -268,10 +338,13 @@ class HgChangeReceiver(delta.Editor):
         if subpath is None:
             return False
         return self._is_file_included(subpath)
-        
+
 
     def _is_path_tag(self, path):
-        """If path represents the path to a tag, returns the tag name.
+        """If path could represent the path to a tag, returns the potential tag name.
+
+        Note that it's only a tag if it was copied from the path '' in a branch (or tag)
+        we have, for our purposes.
 
         Otherwise, returns False.
         """
@@ -279,7 +352,7 @@ class HgChangeReceiver(delta.Editor):
         for tags_path in self.tag_locations:
             if path and (path.startswith(tags_path) and
                          len(path) > len('%s/' % tags_path)):
-                return path[len(tags_path)+1:].split('/')[0]
+                return path[len(tags_path)+1:]
         return False
 
     def get_parent_svn_branch_and_rev(self, number, branch):
@@ -321,6 +394,28 @@ class HgChangeReceiver(delta.Editor):
             return self.revmap[r, br]
         return revlog.nullid
 
+    def _svnpath(self, branch):
+        """Return the relative path in svn of branch.
+        """
+        if branch == None or branch == 'default':
+            return 'trunk'
+        elif branch.startswith('../'):
+            return branch[3:]
+        return 'branches/%s' % branch
+
+    def __determine_parent_branch(self, p, src_path, src_rev, revnum):
+        if src_path is not None:
+            src_file, src_branch = self._path_and_branch_for_path(src_path)
+            src_tag = self._is_path_tag(src_path)
+            if src_tag != False:
+                # also case 2
+                src_branch, src_rev = self.tags[src_tag]
+                return {self._localname(p): (src_branch, src_rev, revnum )}
+            if src_file == '':
+                # case 2
+                return {self._localname(p): (src_branch, src_rev, revnum )}
+        return {}
+
     def update_branch_tag_map_for_rev(self, revision):
         paths = revision.paths
         added_branches = {}
@@ -328,37 +423,8 @@ class HgChangeReceiver(delta.Editor):
         self.branches_to_delete = set()
         tags_to_delete = set()
         for p in sorted(paths):
-            fi, br = self._path_and_branch_for_path(p)
-            if fi is not None:
-                if fi == '' and paths[p].action != 'D':
-                    src_p = paths[p].copyfrom_path
-                    src_rev = paths[p].copyfrom_rev
-                    src_tag = self._is_path_tag(src_p)
-
-                    if not ((src_p and self._is_path_valid(src_p)) or
-                            (src_tag and src_tag in self.tags)):
-                        # The branch starts here and is not a copy
-                        src_branch = None
-                        src_rev = 0
-                    elif src_tag:
-                        # this is a branch created from a tag. Note that this
-                        # really does happen (see Django)
-                        src_branch, src_rev = self.tags[src_tag]
-                    else:
-                        # Not from a tag, and from a valid repo path
-                        (src_p,
-                        src_branch) = self._path_and_branch_for_path(src_p)
-                        if src_p is None:
-                            continue
-                    if (br not in self.branches or
-                        not (src_rev == 0 and src_branch == None)):
-                        added_branches[br] = src_branch, src_rev, revision.revnum
-                elif fi == '' and br in self.branches:
-                    self.branches_to_delete.add(br)
-            else:
-                t_name = self._is_path_tag(p)
-                if t_name == False:
-                    continue
+            t_name = self._is_path_tag(p)
+            if t_name != False:
                 src_p, src_rev = paths[p].copyfrom_path, paths[p].copyfrom_rev
                 # if you commit to a tag, I'm calling you stupid and ignoring
                 # you.
@@ -378,6 +444,41 @@ class HgChangeReceiver(delta.Editor):
                 elif (paths[p].action == 'D' and p.endswith(t_name)
                       and t_name in self.tags):
                         tags_to_delete.add(t_name)
+                continue
+            # At this point we know the path is not a tag. In that case, we only care if it
+            # is the root of a new branch (in this function). This is determined by the
+            # following checks:
+            # 1. Is the file located inside any currently known branch?
+            #     If yes, then we're done with it, this isn't interesting.
+            # 2. Does the file have copyfrom information that means it is a copy from the root
+            #    of some other branch?
+            #     If yes, then we're done: this is a new branch, and we record the copyfrom in
+            #     added_branches
+            # 3. Neither of the above. This could be a branch, but it might never work out for
+            #    us. It's only ever a branch (as far as we're concerned) if it gets committed
+            #    to, which we have to detect at file-write time anyway. So we do nothing here.
+            # 4. It's the root of an already-known branch, with an action of 'D'. We mark the
+            #    branch as deleted.
+            # 5. It's the parent directory of one or more already-known branches, so we mark them
+            #    as deleted.
+            # 6. It's a branch being replaced by another branch - the action will be 'R'.
+            fi, br = self._path_and_branch_for_path(p)
+            if fi is not None:
+                if fi == '':
+                    if paths[p].action == 'D':
+                        self.branches_to_delete.add(br) # case 4
+                    elif paths[p].action == 'R':
+                        added_branches.update(self.__determine_parent_branch(p, paths[p].copyfrom_path,
+                                                                             paths[p].copyfrom_rev,
+                                                                             revision.revnum))
+                continue # case 1
+            if paths[p].action == 'D':
+                # check for case 5
+                for known in self.branches:
+                    if self._svnpath(known).startswith(p):
+                        self.branches_to_delete.add(br) # case 5
+            added_branches.update(self.__determine_parent_branch(p, paths[p].copyfrom_path,
+                                                                 paths[p].copyfrom_rev, revision.revnum))
         for t in tags_to_delete:
             del self.tags[t]
         for br in self.branches_to_delete:
@@ -556,6 +657,7 @@ class HgChangeReceiver(delta.Editor):
             our_util.describe_commit(self.ui, new_hash, branch)
             if (rev.revnum, branch) not in self.revmap:
                 self.add_to_revmap(rev.revnum, branch, new_hash)
+        self._save_metadata()
         self.clear_current_info()
 
     def authorforsvnauthor(self, author):
@@ -741,9 +843,12 @@ class HgChangeReceiver(delta.Editor):
         self.base_revision = None
         if path in self.deleted_files:
             del self.deleted_files[path]
-        fpath, branch = self._path_and_branch_for_path(path)
+        fpath, branch = self._path_and_branch_for_path(path, existing=False)
         if not fpath:
             return
+        if branch not in self.branches:
+            # we know this branch will exist now, because it has at least one file. Rock.
+            self.branches[branch] = None, 0, self.current_rev.revnum
         self.current_file = path
         self.should_edit_most_recent_plaintext = False
         if not copyfrom_path:
@@ -790,13 +895,14 @@ class HgChangeReceiver(delta.Editor):
             if not self._is_path_valid(copyfrom_path) and not tag:
                 self.missing_plaintexts.add('%s/' % path)
                 return path
-
         if tag:
             source_branch, source_rev = self.tags[tag]
             cp_f = ''
         else:
             source_rev = copyfrom_revision
             cp_f, source_branch = self._path_and_branch_for_path(copyfrom_path)
+            if cp_f == '' and br_path == '':
+                self.branches[branch] = source_branch, source_rev, self.current_rev.revnum
         new_hash = self.get_parent_revision(source_rev + 1,
                                             source_branch)
         if new_hash == node.nullid:
--- a/tests/test_fetch_command.py
+++ b/tests/test_fetch_command.py
@@ -82,9 +82,12 @@ class TestBasicRepoLayout(test_util.Test
 
     def test_file_mixed_with_branches(self):
         repo = self._load_fixture_and_fetch('file_mixed_with_branches.svndump')
-        self.assertEqual(node.hex(repo['tip'].node()),
+        self.assertEqual(node.hex(repo['default'].node()),
                          '434ed487136c1b47c1e8f952edb4dc5a8e6328df')
         assert 'README' not in repo
+        self.assertEqual(repo['tip'].branch(),
+                         '../branches')
+
 
     def test_files_copied_from_outside_btt(self):
         repo = self._load_fixture_and_fetch(
--- a/tests/test_fetch_mappings.py
+++ b/tests/test_fetch_mappings.py
@@ -53,7 +53,7 @@ class MapTests(test_util.TestBase):
 
     def test_author_map_closing_author_stupid(self):
         self.test_author_map_closing_author(True)
-        
+
     def test_file_map(self, stupid=False):
         test_util.load_svndump_fixture(self.repo_path, 'replace_trunk_with_branch.svndump')
         filemap = open(self.filemap, 'w')
@@ -66,10 +66,10 @@ class MapTests(test_util.TestBase):
                                       filemap=self.filemap)
         self.assertEqual(node.hex(self.repo[0].node()), '88e2c7492d83e4bf30fbb2dcbf6aa24d60ac688d')
         self.assertEqual(node.hex(self.repo['default'].node()), 'e524296152246b3837fe9503c83b727075835155')
-        
+
     def test_file_map_stupid(self):
         self.test_file_map(True)
-        
+
     def test_file_map_exclude(self, stupid=False):
         test_util.load_svndump_fixture(self.repo_path, 'replace_trunk_with_branch.svndump')
         filemap = open(self.filemap, 'w')
@@ -81,8 +81,8 @@ class MapTests(test_util.TestBase):
                                       stupid=stupid,
                                       filemap=self.filemap)
         self.assertEqual(node.hex(self.repo[0].node()), '2c48f3525926ab6c8b8424bcf5eb34b149b61841')
-        self.assertEqual(node.hex(self.repo['default'].node()), '86fc12d173716139d5bd1d36866038d355009f45')
-        
+        self.assertEqual(node.hex(self.repo['default'].node()), 'b37a3c0297b71f989064d9b545b5a478bbed7cc1')
+
     def test_file_map_exclude_stupid(self):
         self.test_file_map_exclude(True)