about summary refs log tree commit diff
diff options
context:
space:
mode:
author Rob Austein <sra@hactrn.net> 2012-03-09 03:45:42 +0000
committer Rob Austein <sra@hactrn.net> 2012-03-09 03:45:42 +0000
commit 0eecc7942adf27d15c3fc86ec51affed180abc7e (patch)
tree 41dcda75c411d0a6cbcab23af1238d9a0edabcf2
parent 2ec3eb5619c9a5f1505d7eda238cb1a555cc5395 (diff)
Use Python "shelve" module to avoid processing XML we've already seen
in a previous run, which speeds this tediously slow script up by something close to an order of magnitude.

svn path=/trunk/; revision=4390
-rw-r--r-- scripts/analyze-rcynic-history.py 55
1 files changed, 36 insertions, 19 deletions
diff --git a/scripts/analyze-rcynic-history.py b/scripts/analyze-rcynic-history.py
index 7d918198..6e8d3598 100644
--- a/scripts/analyze-rcynic-history.py
+++ b/scripts/analyze-rcynic-history.py
@@ -24,7 +24,15 @@ plot_to_one = True
plot_to_many = True
write_rcynic_xml = True
-import mailbox, sys, urlparse, os, getopt, datetime, subprocess
+import mailbox
+import sys
+import urlparse
+import os
+import getopt
+import datetime
+import subprocess
+import shelve
+import whichdb
from xml.etree.cElementTree import (ElementTree as ElementTree,
fromstring as ElementTreeFromString)
@@ -202,39 +210,46 @@ def plot_one(hostnames, fields):
mb = mailbox.Maildir("/u/sra/rpki/rcynic-xml", factory = None, create = False)
+gdbm_file = "rcynic-xml.gdbm"
+
+# Disgusting workaround for dumb bug, see http://bugs.python.org/issue13007
+if whichdb.whichdb(gdbm_file) == "":
+ whichdb.whichdb = lambda filename: "gdbm"
+
+shelf = shelve.open(gdbm_file)
+
sessions = []
latest = None
for i, key in enumerate(mb.iterkeys(), 1):
-
sys.stderr.write("\r%s %d/%d..." % ("|\\-/"[i & 3], i, len(mb)))
- assert not mb[key].is_multipart()
+ if key in shelf:
+ session = shelf[key]
- input = ElementTreeFromString(mb[key].get_payload())
-
- date = input.get("date")
-
- sys.stderr.write("%s..." % date)
+ else:
+ assert not mb[key].is_multipart()
+ input = ElementTreeFromString(mb[key].get_payload())
+ date = input.get("date")
+ sys.stderr.write("%s..." % date)
+ session = Session(date, key)
+ for elt in input.findall("rsync_history"):
+ session.add_rsync_history(Rsync_History(elt))
+ for elt in input.findall("validation_status"):
+ if elt.get("generation") == "current":
+ session.add_uri(elt.text.strip())
+ session.finalize()
+ shelf[key] = session
- session = Session(date, key)
sessions.append(session)
-
if latest is None or session.session_id > latest.session_id:
latest = session
- for elt in input.findall("rsync_history"):
- session.add_rsync_history(Rsync_History(elt))
-
- for elt in input.findall("validation_status"):
- if elt.get("generation") == "current":
- session.add_uri(elt.text.strip())
-
- session.finalize()
-
sys.stderr.write("\n")
+shelf.sync()
+
if plot_all_hosts:
hostnames = set()
for session in sessions:
@@ -256,3 +271,5 @@ if write_rcynic_xml and latest is not None:
f = open("rcynic.xml", "wb")
f.write(mb[latest.msg_key].get_payload())
f.close()
+
+shelf.close()