#!/usr/bin/env python # $Id$ # # Copyright (C) 2013-2014 Dragon Research Labs ("DRL") # Portions copyright (C) 2011-2012 Internet Systems Consortium ("ISC") # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notices and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND DRL AND ISC DISCLAIM ALL # WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL DRL OR # ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL # DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA # OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. """ Parse rcynic XML output, stuff the data that validation_status script would print into an SQL database for subsequent analysis. """ import os import sys import time import mailbox import sqlite3 import argparse import lxml.etree import subprocess class Parser(object): @staticmethod def main(): parser = argparse.ArgumentParser( description = __doc__, formatter_class = argparse.ArgumentDefaultsHelpFormatter) group = parser.add_mutually_exclusive_group(required = True) group.add_argument("--mailbox", "--mb", help = "Maildir mailbox containing rcynic XML output") group.add_argument("--tarballs", help = "directory tree of tar files containing containing rcynic XML output") parser.add_argument("--database", "--db", default = "validation-status-sql.db", help = "SQLite3 database") parser.add_argument("--path-within-tarball", default = "var/rcynic/data/rcynic.xml", help = "rcynic.xml path name within tarball(s)") parser.add_argument("--tar-extensions", nargs = "+", default = ".tar .tar.gz .tgz .tar.bz2 .tbz .tar.xz .txz".split(), help = "extensions to recognize as indicating tar files") args = parser.parse_args() if args.mailbox: ParserMailbox(args) else: ParserTarball(args) def __init__(self, args): self.args = args self.init_sql() self.init_hook() self.index1() self.parsed = 1 for self.current, self.iterval in enumerate(self.iterator, 1): self.parse_xml() if self.parsed > 1: sys.stderr.write("\n") self.index2() self.db.close() def init_sql(self): creating = not os.path.exists(self.args.database) self.db = sqlite3.connect(self.args.database) self.db.text_factory = str self.db.executescript(''' PRAGMA foreign_keys = off; PRAGMA synchronous = off; PRAGMA count_changes = off; ''') if creating: self.db.executescript(''' CREATE TABLE sessions ( session_id INTEGER PRIMARY KEY NOT NULL, session DATETIME NOT NULL, handle TEXT NOT NULL ); CREATE TABLE uris ( uri_id INTEGER PRIMARY KEY NOT NULL, uri TEXT NOT NULL ); CREATE TABLE codes ( code_id INTEGER PRIMARY KEY NOT NULL, code TEXT NOT NULL ); CREATE TABLE generations ( generation_id INTEGER PRIMARY KEY NOT NULL, generation TEXT NOT NULL ); CREATE TABLE events ( id INTEGER PRIMARY KEY NOT NULL, timestamp DATETIME NOT NULL, session_id INTEGER NOT NULL REFERENCES sessions (session_id) ON DELETE RESTRICT ON UPDATE RESTRICT, generation_id INTEGER NOT NULL REFERENCES generations (generation_id) ON DELETE RESTRICT ON UPDATE RESTRICT, code_id INTEGER NOT NULL REFERENCES codes (code_id) ON DELETE RESTRICT ON UPDATE RESTRICT, uri_id INTEGER NOT NULL REFERENCES uris (uri_id) ON DELETE RESTRICT ON UPDATE RESTRICT ); CREATE VIEW status AS SELECT id, handle, session, timestamp, generation, code, uri FROM events NATURAL JOIN sessions NATURAL JOIN uris NATURAL JOIN codes NATURAL JOIN generations; ''') def index1(self): self.db.executescript(''' CREATE UNIQUE INDEX IF NOT EXISTS sessions_index ON sessions (session); CREATE UNIQUE INDEX IF NOT EXISTS handles_index ON sessions (handle); CREATE UNIQUE INDEX IF NOT EXISTS uris_index ON uris (uri); CREATE UNIQUE INDEX IF NOT EXISTS codes_index ON codes (code); CREATE UNIQUE INDEX IF NOT EXISTS generations_index ON generations (generation); ''') def index2(self): self.db.executescript(''' CREATE UNIQUE INDEX IF NOT EXISTS events_index ON events (uri_id, timestamp, code_id, generation_id); ''') def string_id(self, table, value): field = table.rstrip("s") try: return self.db.execute("SELECT %s_id FROM %s WHERE %s = ?" % (field, table, field), (value,)).fetchone()[0] except: return self.db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid def parse_xml(self): sys.stderr.write("\r%s %d/%d/%d...%s " % ("|\\-/"[self.current & 3], self.current, self.parsed, self.total, self.handle)) if self.db.execute("SELECT handle FROM sessions WHERE handle = ?", (self.handle,)).fetchone(): return xml = self.read_xml() with self.db: session_id = self.db.execute("INSERT INTO sessions (session, handle) VALUES (strftime('%s', ?), ?)", (xml.get("date"), self.handle)).lastrowid self.db.executemany("INSERT INTO events (session_id, timestamp, generation_id, code_id, uri_id) " "VALUES (?, strftime('%s', ?), ?, ?, ?)", ((session_id, x.get("timestamp"), self.string_id("generations", x.get("generation", "none")), self.string_id("codes", x.get("status")), self.string_id("uris", x.text.strip())) for x in xml.findall("validation_status"))) self.parsed += 1 class ParserTarball(Parser): def init_hook(self): self.total = 0 for fn in self.iter_tarball_names(): self.total += 1 self.iterator = self.iter_tarball_names() @property def handle(self): return self.iterval def read_xml(self): return lxml.etree.ElementTree( file = subprocess.Popen(("tar", "Oxf", self.iterval, self.args.path_within_tarball), stdout = subprocess.PIPE).stdout).getroot() def iter_tarball_names(self): if os.path.isdir(self.args.tarballs): for root, dirs, files in os.walk(self.args.tarballs): for fn in files: if any(fn.endswith(ext) for ext in self.args.tar_extensions): yield os.path.join(root, fn) else: yield self.args.tarballs class ParserMailbox(Parser): def init_hook(self): self.mb = mailbox.Maildir(self.args.mailbox, factory = None, create = False) self.total = len(self.mb) self.iterator = self.mb.iterkeys() @property def handle(self): return self.mb[self.iterval].get("Message-ID") def read_xml(self): return lxml.etree.XML(self.mb[self.iterval].get_payload()) if __name__ == "__main__": try: Parser.main() except KeyboardInterrupt: pass