sync: Order projects according to last fetch time
Some projects may consistently take longer to fetch than others. For
example, a more active project may have many more Gerrit changes than a
less active project, and those changes take longer to transfer. Use a
simple heuristic based on the last fetch time to fetch slower projects
first, so we do not tend to spend the end of the sync fetching a small
number of outliers.
This algorithm is probably not optimal, and due to inter-run latency
variance and Python thread scheduling, we may not even have good
estimates of a project's sync time.
Change-Id: I9a463f214b3ed742e4d807c42925b62cb8b1745b
diff --git a/subcmds/sync.py b/subcmds/sync.py
index e68a025..a8022d9 100644
--- a/subcmds/sync.py
+++ b/subcmds/sync.py
@@ -16,6 +16,7 @@
import netrc
from optparse import SUPPRESS_HELP
import os
+import pickle
import re
import shutil
import socket
@@ -47,6 +48,8 @@
from project import SyncBuffer
from progress import Progress
+_ONE_DAY_S = 24 * 60 * 60
+
class _FetchError(Exception):
"""Internal error thrown in _FetchHelper() when we don't want stack trace."""
pass
@@ -212,10 +215,12 @@
# - We always make sure we unlock the lock if we locked it.
try:
try:
+ start = time.time()
success = project.Sync_NetworkHalf(
quiet=opt.quiet,
current_branch_only=opt.current_branch_only,
clone_bundle=not opt.no_clone_bundle)
+ self._fetch_times.Set(project, time.time() - start)
# Lock around all the rest of the code, since printing, updating a set
# and Progress.update() are not thread safe.
@@ -293,6 +298,7 @@
sys.exit(1)
pm.end()
+ self._fetch_times.Save()
for project in projects:
project.bare_git.gc('--auto')
return fetched
@@ -496,12 +502,15 @@
self.jobs = self.manifest.default.sync_j
all_projects = self.GetProjects(args, missing_ok=True)
+ self._fetch_times = _FetchTimes(self.manifest)
if not opt.local_only:
to_fetch = []
now = time.time()
- if (24 * 60 * 60) <= (now - rp.LastFetch):
+ if _ONE_DAY_S <= (now - rp.LastFetch):
to_fetch.append(rp)
to_fetch.extend(all_projects)
+ to_fetch.sort(key=self._fetch_times.Get, reverse=True)
+ self._fetch_times.Clear()
self._Fetch(to_fetch, opt)
_PostRepoFetch(rp, opt.no_repo_verify)
@@ -602,3 +611,53 @@
print >>sys.stderr
return False
return True
+
+class _FetchTimes(object):
+  """Per-project record of how long the last network fetch took.
+
+  Times are kept in a dict keyed by project name and pickled to
+  '.repopickle_fetchtimes' under manifest.repodir. The file is only a
+  cache: if it is missing, unreadable or corrupt it is discarded and
+  the record starts out empty.
+  """
+
+  def __init__(self, manifest):
+    """Remember where the cache lives; nothing is read until needed."""
+    self._path = os.path.join(manifest.repodir, '.repopickle_fetchtimes')
+    # None means "not loaded yet"; a dict (possibly empty) means loaded.
+    self._times = None
+
+  def Clear(self):
+    """Forget all recorded times without touching the file on disk."""
+    self._times = {}
+
+  def Get(self, project):
+    """Return the recorded fetch time in seconds for project.
+
+    Projects with no recorded time report _ONE_DAY_S.
+    """
+    return self._Load().get(project.name, _ONE_DAY_S)
+
+  def Set(self, project, t):
+    """Record t seconds as the fetch time for project."""
+    # Load first so that calling Set() before Get()/Clear() does not
+    # fail on the initial None.
+    self._Load()[project.name] = t
+
+  def _Remove(self):
+    """Delete the cache file, ignoring failure to do so."""
+    try:
+      os.remove(self._path)
+    except OSError:
+      pass
+
+  def _Load(self):
+    """Return the times dict, reading it from disk on first use.
+
+    A missing file yields an empty dict. A file that cannot be
+    unpickled, or that does not contain a dict, is removed and also
+    yields an empty dict.
+    """
+    if self._times is not None:
+      return self._times
+
+    self._times = {}
+    try:
+      # Binary mode, to match the 'wb' used by Save().
+      f = open(self._path, 'rb')
+    except (IOError, OSError):
+      return self._times
+
+    loaded = None
+    try:
+      try:
+        # NOTE: pickle.load() can execute arbitrary code from the file
+        # it reads; this is acceptable only as long as the cache file
+        # is written solely by Save().
+        loaded = pickle.load(f)
+      except Exception:
+        # Corrupt pickles raise many different exception types, so
+        # catch broadly, but still let KeyboardInterrupt/SystemExit
+        # propagate.
+        loaded = None
+    finally:
+      f.close()
+
+    # Remove only after closing: deleting an open file fails on some
+    # platforms and would leave the bad cache in place.
+    if isinstance(loaded, dict):
+      self._times = loaded
+    else:
+      self._Remove()
+    return self._times
+
+  def Save(self):
+    """Write the recorded times to disk, if any were loaded or set.
+
+    Saving is best effort: if the file cannot be opened nothing is
+    written, and if pickling fails the partial file is removed.
+    """
+    if self._times is None:
+      return
+
+    # Open outside the try/finally below so that a failed open() is
+    # not followed by f.close() on an unbound name.
+    try:
+      f = open(self._path, 'wb')
+    except (IOError, OSError):
+      return
+
+    saved = False
+    try:
+      try:
+        pickle.dump(self._times, f)
+        saved = True
+      except (IOError, OSError, pickle.PickleError):
+        pass
+    finally:
+      f.close()
+
+    if not saved:
+      self._Remove()