changeset 1475:ea4d6142c6d9

maps: do not ask sqlite for row count "SELECT COUNT(1) FROM x" is not O(1) for sqlite and can be slow on large tables. This patch changes the count to be backed by a file instead. The change exposes a risk that the number may become inaccurate, if __setitem__ is called with a same key multiple times. But we don't do that during pull, and only use __len__ to calculate how many revisions pulled, or test if the map is empty. So it would be fine.
author Jun Wu <quark@fb.com>
date Thu, 23 Jun 2016 10:46:07 +0100 (2016-06-23)
parents f21605bcda24
children 581f72f9478b
files hgsubversion/maps.py
diffstat 1 files changed, 18 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/hgsubversion/maps.py
+++ b/hgsubversion/maps.py
@@ -426,6 +426,7 @@ class RevMap(dict):
             revmap.exportrevmapv1(tmppath)
             os.rename(tmppath, self._filepath)
             hgutil.unlinkpath(revmap._dbpath)
+            hgutil.unlinkpath(revmap._rowcountpath, ignoremissing=True)
             return self._readmapfile()
         if ver != self.VERSION:
             raise hgutil.Abort('revmap too new -- please upgrade')
@@ -554,10 +555,13 @@ class SqliteRevMap(collections.MutableMa
 
     lastpulled = util.fileproperty('_lastpulled', lambda x: x._lastpulledpath,
                                    default=0, deserializer=int)
+    rowcount = util.fileproperty('_rowcount', lambda x: x._rowcountpath,
+                                 default=0, deserializer=int)
 
     def __init__(self, revmap_path, lastpulled_path):
         self._filepath = revmap_path
         self._dbpath = revmap_path + '.db'
+        self._rowcountpath = self._dbpath + '.rowcount'
         self._lastpulledpath = lastpulled_path
 
         self._db = None
@@ -608,6 +612,7 @@ class SqliteRevMap(collections.MutableMa
     def clear(self):
         hgutil.unlinkpath(self._filepath, ignoremissing=True)
         hgutil.unlinkpath(self._dbpath, ignoremissing=True)
+        hgutil.unlinkpath(self._rowcountpath, ignoremissing=True)
         self._db = None
         self._hashes = None
         self._firstpull = None
@@ -635,10 +640,8 @@ class SqliteRevMap(collections.MutableMa
         return iter(rows)
 
     def __len__(self):
-        # 'WHERE rev >= 0' hints sqlite to use the rev index
-        with self._transaction() as db:
-            return db.execute('SELECT COUNT(1) FROM revmap ' +
-                              'WHERE rev >= 0').fetchone()[0]
+        # rowcount is faster than "SELECT COUNT(1)". the latter is not O(1)
+        return self.rowcount
 
     def __setitem__(self, key, binha):
         revnum, branch = key
@@ -653,6 +656,8 @@ class SqliteRevMap(collections.MutableMa
 
     def __delitem__(self, key):
         for row in self._querybykey('DELETE', key):
+            if self.rowcount > 0:
+                self.rowcount -= 1
             return
         # For performance reason, self._hashes is not updated
         raise KeyError(key)
@@ -687,6 +692,10 @@ class SqliteRevMap(collections.MutableMa
         self._db.executemany(
             'INSERT OR REPLACE INTO revmap (rev, branch, hash) ' +
             'VALUES (?, ?, ?)', rows)
+        # If REPLACE happens, rowcount can be wrong. But it is only used to
+        # calculate how many revisions pulled, and during pull we don't
+        # replace rows. So it is fine.
+        self.rowcount += len(rows)
 
     def _opendb(self):
         '''Open the database and make sure the table is created on demand.'''
@@ -723,7 +732,12 @@ class SqliteRevMap(collections.MutableMa
         with self._transaction('EXCLUSIVE'):
             map(self._db.execute, self.TABLESCHEMA)
             if version == RevMap.VERSION:
+                self.rowcount = 0
                 self._importrevmapv1()
+            elif not self.rowcount:
+                self.rowcount = self._db.execute(
+                    'SELECT COUNT(1) FROM revmap').fetchone()[0]
+
             # "bulk insert; then create index" is about 2.4x as fast as
             # "create index; then bulk insert" on a large repo
             map(self._db.execute, self.INDEXSCHEMA)