aboutsummaryrefslogtreecommitdiff
path: root/scripts/analyze-rcynic-history.py
diff options
context:
space:
mode:
authorRob Austein <sra@hactrn.net>2012-01-31 02:15:12 +0000
committerRob Austein <sra@hactrn.net>2012-01-31 02:15:12 +0000
commitd47b19fe99f05c4bc49dbdf01ac959e6bfdcaf21 (patch)
tree783b7ef5d925d2bd7112ca1bc0509a8199072014 /scripts/analyze-rcynic-history.py
parent4c9ae6733fcbba2b7720e600cc43ce3230b827ec (diff)
Rewrite to avoid running out of memory. Painfully slow with three
month's worth of data, but at least it now runs again. svn path=/trunk/; revision=4273
Diffstat (limited to 'scripts/analyze-rcynic-history.py')
-rw-r--r--scripts/analyze-rcynic-history.py147
1 files changed, 57 insertions, 90 deletions
diff --git a/scripts/analyze-rcynic-history.py b/scripts/analyze-rcynic-history.py
index 1713e7ce..e06c9c7d 100644
--- a/scripts/analyze-rcynic-history.py
+++ b/scripts/analyze-rcynic-history.py
@@ -4,7 +4,7 @@ summaries and run gnuplot to draw some pictures.
$Id$
-Copyright (C) 2011 Internet Systems Consortium ("ISC")
+Copyright (C) 2011-2012 Internet Systems Consortium ("ISC")
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
@@ -19,8 +19,6 @@ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
"""
-show_summary = True
-show_sessions = True
show_plot = True
plot_all_hosts = False
plot_to_one = True
@@ -31,65 +29,51 @@ import mailbox, sys, urlparse, os, getopt, datetime, subprocess
from xml.etree.cElementTree import (ElementTree as ElementTree,
fromstring as ElementTreeFromString)
-class Rsync_History(object):
+def parse_utc(s):
+ return datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%SZ")
- timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
+class Rsync_History(object):
+ """
+ An Rsync_History object represents one rsync connection.
+ """
def __init__(self, elt):
- self.started = datetime.datetime.strptime(elt.get("started"), self.timestamp_format)
- self.finished = datetime.datetime.strptime(elt.get("finished"), self.timestamp_format)
self.error = elt.get("error")
self.uri = elt.text.strip()
self.hostname = urlparse.urlparse(self.uri).hostname or None
- self.elapsed = self.finished - self.started
-
- def __cmp__(self, other):
- return (cmp(self.started, other.started) or
- cmp(self.finished, other.finished) or
- cmp(self.hostname, other.hostname))
+ self.elapsed = parse_utc(elt.get("finished")) - parse_utc(elt.get("started"))
class Host(object):
+ """
+ A host object represents all the data collected for one host. Note
+ that it (usually) contains a list of all the sessions in which this
+ host appears.
- def __init__(self, hostname, session_id = None):
+ This is probably keeping far too much data, and needs to be pruned
+ to keep memory consumption to something sane.
+ """
+
+ def __init__(self, hostname, session_id):
self.hostname = hostname
- self.session_ids = []
- if session_id is not None:
- self.session_ids.append(session_id)
+ self.session_id = session_id
self.elapsed = datetime.timedelta(0)
self.connection_count = 0
self.dead_connections = 0
self.uris = set()
- self.connections = []
-
- def __add__(self, other):
- assert self.hostname == other.hostname
- result = self.__class__(self.hostname)
- for a in ("elapsed", "connection_count", "dead_connections", "session_ids", "connections"):
- setattr(result, a, getattr(self, a) + getattr(other, a))
- result.uris = self.uris | other.uris
- return result
+ self.total_connection_time = datetime.timedelta(0)
def add_rsync_history(self, h):
- self.connection_count += 1
- self.elapsed += h.elapsed
- self.dead_connections += int(h.error is not None)
- self.connections.append(h)
+ self.connection_count += 1
+ self.elapsed += h.elapsed
+ self.dead_connections += int(h.error is not None)
+ self.total_connection_time += h.elapsed
def add_uri(self, u):
self.uris.add(u)
- @property
- def session_id(self):
- assert len(self.session_ids) == 1
- return self.session_ids[0]
-
- @property
- def session_count(self):
- return len(self.session_ids)
-
- @property
- def object_count(self):
- return len(self.uris)
+ def finalize(self):
+ self.object_count = len(self.uris)
+ del self.uris
@property
def failure_rate_percentage(self):
@@ -97,28 +81,15 @@ class Host(object):
@property
def seconds_per_object(self):
- return (float((self.elapsed.days * 24 * 3600 + self.elapsed.seconds) * 10**6 +
- self.elapsed.microseconds) /
- float(self.object_count * self.session_count * 10**6))
+ return float(self.elapsed.total_seconds()) / float(self.object_count)
@property
def objects_per_connection(self):
- return (float(self.object_count * self.session_count) /
- float(self.connection_count))
-
- @property
- def scaled_connections(self):
- return float(self.connection_count) / float(self.session_count)
-
- @property
- def scaled_elapsed(self):
- return self.elapsed / self.session_count
+ return float(self.object_count) / float(self.connection_count)
@property
def average_connection_time(self):
- return (float(sum(((c.elapsed.days * 24 * 3600 + c.elapsed.seconds) * 10**6 + c.elapsed.microseconds)
- for c in self.connections)) /
- float(self.connection_count * 10**6))
+ return float(self.total_connection_time.total_seconds()) / float(self.connection_count)
class Format(object):
@@ -135,8 +106,7 @@ class Host(object):
except ZeroDivisionError:
return self.oops
- format = (Format("scaled_elapsed", "Rsync Time", ".10s"),
- Format("scaled_connections", "Connections", "d"),
+ format = (Format("connection_count", "Connections", "d"),
Format("object_count", "Objects", "d"),
Format("objects_per_connection", "Objects/Connection", ".3f"),
Format("seconds_per_object", "Seconds/Object", ".3f"),
@@ -157,6 +127,15 @@ class Host(object):
return self.format_dict[name](self).strip()
class Session(dict):
+ """
+ A session corresponds to one XML file. This is a dictionary of Host
+ objects, keyed by hostname.
+
+ We might need some kind of .finalize() method which throws away
+ unnecessary data to keep memory consumption down after we've read
+ the whole session. Most likely this would just be a pass through to
+ a Host.finalize() method which would do the real work.
+ """
def __init__(self, session_id = None):
self.session_id = session_id
@@ -168,17 +147,6 @@ class Session(dict):
def get_plot_row(self, name, hostnames):
return (self.session_id,) + tuple(self[h].format_field(name) if h in self else "" for h in hostnames)
- def __add__(self, other):
- result = self.__class__()
- for h in self.hostnames | other.hostnames:
- if h in self and h in other:
- result[h] = self[h] + other[h]
- elif h in self:
- result[h] = self[h]
- else:
- result[h] = other[h]
- return result
-
def add_rsync_history(self, h):
if h.hostname not in self:
self[h.hostname] = Host(h.hostname, self.session_id)
@@ -189,18 +157,17 @@ class Session(dict):
if h and h in self:
self[h].add_uri(u)
- def dump(self, title, f = sys.stdout):
- f.write("\n" + title + "\n" + Host.header + "\n")
- for h in sorted(self):
- f.write(str(self[h]) + "\n")
+ def finalize(self):
+ for h in self.itervalues():
+ h.finalize()
mb = mailbox.Maildir("/u/sra/rpki/rcynic-xml", factory = None, create = False)
sessions = []
-for msg in mb.itervalues():
+for i, msg in enumerate(mb.itervalues()):
- sys.stderr.write(".")
+ sys.stderr.write("\r%s %d/%d..." % ("|\\-/"[i & 3], i, len(mb)))
assert not msg.is_multipart()
@@ -216,16 +183,9 @@ for msg in mb.itervalues():
if elt.get("generation") == "current":
session.add_uri(elt.text.strip())
-sys.stderr.write("\n")
-
-summary = sum(sessions, Session())
-
-if show_summary:
- summary.dump("Summary (%d sessions)" % len(sessions))
+ session.finalize()
-if show_sessions:
- for i, session in enumerate(sessions, 1):
- session.dump("Session #%d (%s)" % (i, session.session_id))
+sys.stderr.write("\n")
def plotter(f, hostnames, field, logscale = False):
plotlines = sorted(session.get_plot_row(field, hostnames) for session in sessions)
@@ -267,18 +227,25 @@ def plot_one(hostnames, fields):
gnuplot.stdin.write("set terminal pdf\n")
gnuplot.stdin.write("set output 'analyze-rcynic-history.pdf'\n")
for field in fields:
- if field not in ("scaled_elapsed", "hostname"):
+ if field != "hostname":
plotter(gnuplot.stdin, hostnames, field, logscale = False)
plotter(gnuplot.stdin, hostnames, field, logscale = True)
gnuplot.stdin.close()
gnuplot.wait()
if show_plot:
+
if plot_all_hosts:
- hostnames = sorted(summary.hostnames)
+ hostnames = set()
+ for session in sessions:
+ hostnames.update(session.hostnames)
+ hostnames = sorted(hostnames)
+
else:
- hostnames = ("rpki.apnic.net", "rpki.ripe.net", "repository.lacnic.net", "rpki.afrinic.net", "arin.rpki.net", "rgnet.rpki.net")
- fields = [fmt.attr for fmt in Host.format if fmt.attr not in ("scaled_elapsed", "hostname")]
+ hostnames = ("rpki.apnic.net", "rpki.ripe.net", "repository.lacnic.net",
+ "rpki.afrinic.net", "arin.rpki.net", "rgnet.rpki.net")
+
+ fields = [fmt.attr for fmt in Host.format if fmt.attr != "hostname"]
if plot_to_one:
plot_one(hostnames, fields)
if plot_to_many: