# HG changeset patch # User Patrick Mezard # Date 1347816709 -7200 # Node ID e1dbd9646d6a6105e6df446030568bdc9bbe53f6 # Parent 5bacb9c63e3eef7991dbe17bb1e6d08e673c1a45 svnwrap: use custom StringIO class in get_file() The wrappers were calling ra.get_file() with a cStringIO object. Empirically, svn 1.7.5 is writing 16kB blocks to the stream object, and cStringIO reallocates its internal buffer and doubles its size whenever it is filled. With large committed files this requires two large memory blocks at the same time. SimpleStringIO implements the minimum StringIO interface used by ra.get_file() but instead stores all the blocks and "joins" them at the end. It means more fragmentation but requires only one large block, without overallocation. Also, 16kB blocks should be friendly to most allocators. In practice, this simple change let me convert a revision containing multiple moderately large files, the largest being around 450MB, with a 32-bit Windows setup, python 2.7, swig svn 1.7.5, in stupid mode, while it was previously aborting with "not enough memory". The same revision still fails in replay mode. diff --git a/hgsubversion/svnwrap/common.py b/hgsubversion/svnwrap/common.py --- a/hgsubversion/svnwrap/common.py +++ b/hgsubversion/svnwrap/common.py @@ -144,3 +144,26 @@ def parse_autoprops(prop_list): properties[prop.strip()] = value return properties +class SimpleStringIO(object): + """SimpleStringIO can replace a StringIO in write mode. + + cStringIO reallocates and doubles the size of its internal buffer + when it needs to append new data which requires two large blocks for + large inputs. SimpleStringIO stores each individual blocks and joins + them once done. This might cause more memory fragmentation but + requires only one large block. In practice, ra.get_file() seems to + write in 16kB blocks (svn 1.7.5) which should be friendly to memory + allocators. 
+ """ + def __init__(self): + self._blocks = [] + + def write(self, s): + self._blocks.append(s) + + def getvalue(self): + return ''.join(self._blocks) + + def close(self): + del self._blocks + diff --git a/hgsubversion/svnwrap/subvertpy_wrapper.py b/hgsubversion/svnwrap/subvertpy_wrapper.py --- a/hgsubversion/svnwrap/subvertpy_wrapper.py +++ b/hgsubversion/svnwrap/subvertpy_wrapper.py @@ -472,7 +472,7 @@ class SubversionRepo(object): """ mode = '' try: - out = cStringIO.StringIO() + out = common.SimpleStringIO() rev, info = self.remote.get_file(path, out, revision) data = out.getvalue() out.close() diff --git a/hgsubversion/svnwrap/svn_swig_wrapper.py b/hgsubversion/svnwrap/svn_swig_wrapper.py --- a/hgsubversion/svnwrap/svn_swig_wrapper.py +++ b/hgsubversion/svnwrap/svn_swig_wrapper.py @@ -505,7 +505,7 @@ class SubversionRepo(object): assert not path.startswith('/') mode = '' try: - out = cStringIO.StringIO() + out = common.SimpleStringIO() info = ra.get_file(self.ra, path, revision, out) data = out.getvalue() out.close()