From ae65819856649f96d5ac91391606eff91552db39 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Tue, 19 Aug 2014 21:08:13 +0000 Subject: Add tarball filenames to session table so we don't have to do all the work of extracting and parsing before discovering that we've hit a duplicate. Not sure what equivalent would be for Maildir (maybe Message-ID?) so deferring that for now. svn path=/trunk/; revision=5925 --- potpourri/validation-status-sql.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index d37cb4ba..62e3089e 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -60,23 +60,20 @@ if creating: db.executescript(''' CREATE TABLE sessions ( id INTEGER PRIMARY KEY NOT NULL, - session DATETIME NOT NULL, - UNIQUE (session)); + session DATETIME UNIQUE NOT NULL, + filename TEXT UNIQUE); CREATE TABLE uris ( id INTEGER PRIMARY KEY NOT NULL, - uri TEXT NOT NULL, - UNIQUE (uri)); + uri TEXT UNIQUE NOT NULL); CREATE TABLE codes ( id INTEGER PRIMARY KEY NOT NULL, - code TEXT NOT NULL, - UNIQUE (code)); + code TEXT UNIQUE NOT NULL); CREATE TABLE generations ( id INTEGER PRIMARY KEY NOT NULL, - generation TEXT, - UNIQUE (generation)); + generation TEXT UNIQUE); CREATE TABLE events ( id INTEGER PRIMARY KEY NOT NULL, @@ -108,9 +105,10 @@ def string_id(table, value): return db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid -def parse_xml(xml): +def parse_xml(xml, fn = None): try: - session_id = db.execute("INSERT INTO sessions (session) VALUES (datetime(?))", (xml.get("date"),)).lastrowid + session_id = db.execute("INSERT INTO sessions (session, filename) VALUES (datetime(?), ?)", + (xml.get("date"), fn)).lastrowid except sqlite3.IntegrityError: return @@ -125,9 +123,11 @@ def parse_xml(xml): def parse_tarball(fn): + if db.execute("SELECT filename FROM sessions WHERE filename = ?", (fn,)).fetchone(): + return print "Processing", fn - parse_xml(lxml.etree.ElementTree( - file = subprocess.Popen(("tar", "Oxf", fn, args.path_within_tarball), stdout = subprocess.PIPE).stdout).getroot()) + pipe = subprocess.Popen(("tar", "Oxf", fn, args.path_within_tarball), stdout = subprocess.PIPE).stdout + parse_xml(lxml.etree.ElementTree(file = pipe).getroot(), fn) if args.mailbox: -- cgit v1.2.3 From 19aaabec6feb5038e75f3c0868415bccb015b430 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Wed, 20 Aug 2014 03:25:26 +0000 Subject: Restructure, use Message-ID instead of filename when reading Maildir. svn path=/trunk/; revision=5927 --- potpourri/validation-status-sql.py | 270 +++++++++++++++++++++---------------- 1 file changed, 157 insertions(+), 113 deletions(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index 62e3089e..858c8a10 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -32,120 +32,164 @@ import argparse import lxml.etree import subprocess -parser = argparse.ArgumentParser( - description = __doc__, - formatter_class = argparse.ArgumentDefaultsHelpFormatter) -group = parser.add_mutually_exclusive_group(required = True) -group.add_argument("--mailbox", "--mb", - help = "Maildir mailbox containing rcynic XML output") -group.add_argument("--tarballs", - help = "directory tree of tar files containing containing rcynic XML output") -parser.add_argument("--database", "--db", - default = "validation-status-sql.db", - help = "name for SQLite3 database") -parser.add_argument("--path-within-tarball", - default = "var/rcynic/data/rcynic.xml", - help = "name of file to extract from tarball(s)") -parser.add_argument("--tar-extensions", nargs = "+", - default = ".tar .tar.gz .tgz .tar.bz2 .tbz .tar.xz .txz".split(), - help = "extensions to recognize as indicating tar files") -args = parser.parse_args() - -creating = not os.path.exists(args.database) -db = sqlite3.connect(args.database) -db.text_factory = str -db.execute("PRAGMA foreign_keys = on") - -if creating: - db.executescript(''' - CREATE TABLE sessions ( - id INTEGER PRIMARY KEY NOT NULL, - session DATETIME UNIQUE NOT NULL, - filename TEXT UNIQUE); - - CREATE TABLE uris ( - id INTEGER PRIMARY KEY NOT NULL, - uri TEXT UNIQUE NOT NULL); - - CREATE TABLE codes ( - id INTEGER PRIMARY KEY NOT NULL, - code TEXT UNIQUE NOT NULL); - - CREATE TABLE generations ( - id INTEGER PRIMARY KEY NOT NULL, - generation TEXT UNIQUE); - - CREATE TABLE events ( - id INTEGER PRIMARY KEY NOT NULL, - timestamp DATETIME NOT NULL, - session_id INTEGER NOT NULL REFERENCES sessions(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - generation_id INTEGER NOT NULL REFERENCES generations(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - code_id INTEGER NOT NULL REFERENCES codes(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - uri_id INTEGER NOT NULL REFERENCES uris(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - UNIQUE (timestamp, generation_id, code_id, uri_id)); - - CREATE VIEW status AS - SELECT events.id, session, timestamp, generation, uri, code - FROM events - JOIN sessions ON sessions.id = events.session_id - JOIN uris ON uris.id = events.uri_id - JOIN codes ON codes.id = events.code_id - JOIN generations ON generations.id = events.generation_id; - ''') - - -def string_id(table, value): - field = table.rstrip("s") - try: - if value is None: - return db.execute("SELECT id FROM %s WHERE %s IS NULL" % (table, field)).fetchone()[0] +class Parser(object): + + @staticmethod + def main(): + parser = argparse.ArgumentParser( + description = __doc__, + formatter_class = argparse.ArgumentDefaultsHelpFormatter) + group = parser.add_mutually_exclusive_group(required = True) + group.add_argument("--mailbox", "--mb", + help = "Maildir mailbox containing rcynic XML output") + group.add_argument("--tarballs", + help = "directory tree of tar files containing containing rcynic XML output") + parser.add_argument("--database", "--db", + default = "validation-status-sql.db", + help = "SQLite3 database") + parser.add_argument("--path-within-tarball", + default = "var/rcynic/data/rcynic.xml", + help = "rcynic.xml path name within tarball(s)") + parser.add_argument("--tar-extensions", nargs = "+", + default = ".tar .tar.gz .tgz .tar.bz2 .tbz .tar.xz .txz".split(), + help = "extensions to recognize as indicating tar files") + args = parser.parse_args() + if args.mailbox: + ParserMailbox(args) + else: + ParserTarball(args) + + def __init__(self, args): + self.args = args + self.init_sql() + self.init_hook() + self.parsed = 1 + for self.current, self.iterval in enumerate(self.iterator, 1): + self.parse_xml() + if self.parsed > 1: + sys.stderr.write("\n") + db.close() + + + def init_sql(self): + creating = not os.path.exists(self.args.database) + self.db = sqlite3.connect(self.args.database) + self.db.text_factory = str + self.db.execute("PRAGMA foreign_keys = on") + + if creating: + self.db.executescript(''' + CREATE TABLE sessions ( + id INTEGER PRIMARY KEY NOT NULL, + session DATETIME UNIQUE NOT NULL, + handle TEXT UNIQUE NOT NULL); + + CREATE TABLE uris ( + id INTEGER PRIMARY KEY NOT NULL, + uri TEXT UNIQUE NOT NULL); + + CREATE TABLE codes ( + id INTEGER PRIMARY KEY NOT NULL, + code TEXT UNIQUE NOT NULL); + + CREATE TABLE generations ( + id INTEGER PRIMARY KEY NOT NULL, + generation TEXT UNIQUE); + + CREATE TABLE events ( + id INTEGER PRIMARY KEY NOT NULL, + timestamp DATETIME NOT NULL, + session_id INTEGER NOT NULL REFERENCES sessions(id) ON DELETE RESTRICT ON UPDATE RESTRICT, + generation_id INTEGER NOT NULL REFERENCES generations(id) ON DELETE RESTRICT ON UPDATE RESTRICT, + code_id INTEGER NOT NULL REFERENCES codes(id) ON DELETE RESTRICT ON UPDATE RESTRICT, + uri_id INTEGER NOT NULL REFERENCES uris(id) ON DELETE RESTRICT ON UPDATE RESTRICT, + UNIQUE (timestamp, generation_id, code_id, uri_id)); + + CREATE VIEW status AS + SELECT events.id, handle, session, timestamp, generation, uri, code + FROM events + JOIN sessions ON sessions.id = events.session_id + JOIN uris ON uris.id = events.uri_id + JOIN codes ON codes.id = events.code_id + JOIN generations ON generations.id = events.generation_id; + ''') + + def string_id(self, table, value): + field = table.rstrip("s") + try: + if value is None: + return self.db.execute("SELECT id FROM %s WHERE %s IS NULL" % (table, field)).fetchone()[0] + else: + return self.db.execute("SELECT id FROM %s WHERE %s = ?" % (table, field), (value,)).fetchone()[0] + except: + return self.db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid + + + def parse_xml(self): + sys.stderr.write("\r%s %d/%d/%d...%s " % ("|\\-/"[self.current & 3], + self.current, self.parsed, self.total, self.handle)) + if self.db.execute("SELECT handle FROM sessions WHERE handle = ?", (self.handle,)).fetchone(): + return + xml = self.read_xml() + with self.db: + session_id = self.db.execute("INSERT INTO sessions (session, handle) VALUES (datetime(?), ?)", + (xml.get("date"), self.handle)).lastrowid + self.db.executemany("INSERT INTO events (session_id, timestamp, generation_id, code_id, uri_id) " + "VALUES (?, datetime(?), ?, ?, ?)", + ((session_id, + x.get("timestamp"), + self.string_id("generations", x.get("generation")), + self.string_id("codes", x.get("status")), + self.string_id("uris", x.text.strip())) + for x in xml.findall("validation_status"))) + self.parsed += 1 + + +class ParserTarball(Parser): + + def init_hook(self): + self.total = 0 + for fn in self.iter_tarball_names(): + self.total += 1 + self.iterator = self.iter_tarball_names() + + @property + def handle(self): + return self.iterval + + def read_xml(self): + return lxml.etree.ElementTree( + file = subprocess.Popen(("tar", "Oxf", self.iterval, self.args.path_within_tarball), + stdout = subprocess.PIPE).stdout).getroot() + + def iter_tarball_names(self): + if os.path.isdir(self.args.tarballs): + for root, dirs, files in os.walk(self.args.tarballs): + for fn in files: + if any(fn.endswith(ext) for ext in self.args.tar_extensions): + yield os.path.join(root, fn) else: - return db.execute("SELECT id FROM %s WHERE %s = ?" % (table, field), (value,)).fetchone()[0] - except: - return db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid + yield self.args.tarballs + + +class ParserMailbox(Parser): + def init_hook(self): + self.mb = mailbox.Maildir(self.args.mailbox, factory = None, create = False) + self.total = len(self.mb) + self.iterator = self.mb.iterkeys() -def parse_xml(xml, fn = None): + @property + def handle(self): + return self.mb[self.iterval].get("Message-ID") + + def read_xml(self): + return lxml.etree.XML(self.mb[self.iterval].get_payload()) + + +if __name__ == "__main__": try: - session_id = db.execute("INSERT INTO sessions (session, filename) VALUES (datetime(?), ?)", - (xml.get("date"), fn)).lastrowid - except sqlite3.IntegrityError: - return - - with db: - db.executemany("INSERT INTO events (session_id, timestamp, generation_id, code_id, uri_id) VALUES (?, datetime(?), ?, ?, ?)", - ((session_id, - x.get("timestamp"), - string_id("generations", x.get("generation")), - string_id("codes", x.get("status")), - string_id("uris", x.text.strip())) - for x in xml.findall("validation_status"))) - - -def parse_tarball(fn): - if db.execute("SELECT filename FROM sessions WHERE filename = ?", (fn,)).fetchone(): - return - print "Processing", fn - pipe = subprocess.Popen(("tar", "Oxf", fn, args.path_within_tarball), stdout = subprocess.PIPE).stdout - parse_xml(lxml.etree.ElementTree(file = pipe).getroot(), fn) - - -if args.mailbox: - mb = mailbox.Maildir(args.mailbox, factory = None, create = False) - for i, key in enumerate(mb.iterkeys(), 1): - sys.stderr.write("\r%s %d/%d..." % ("|\\-/"[i & 3], i, len(mb))) - parse_xml(lxml.etree.XML(mb[key].get_payload())) - sys.stderr.write("\n") - -elif not os.path.isdir(args.tarballs): - parse_tarball(args.tarballs) - -else: - if os.path.isdir(args.tarballs): - for root, dirs, files in os.walk(args.tarballs): - for fn in files: - if any(fn.endswith(ext) for ext in args.tar_extensions): - parse_tarball(os.path.join(root, fn)) - - -db.close() + Parser.main() + except KeyboardInterrupt: + pass + -- cgit v1.2.3 From 117944e68f33063e1881c026c0b427dcc5d283dd Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Wed, 20 Aug 2014 03:25:57 +0000 Subject: Typo in copyright line. svn path=/trunk/; revision=5928 --- potpourri/validation-status-sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index 858c8a10..90d66435 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -2,7 +2,7 @@ # $Id$ # -# Copyright (C) 2013-2014 2014 Dragon Research Labs ("DRL") +# Copyright (C) 2013-2014 Dragon Research Labs ("DRL") # Portions copyright (C) 2011-2012 Internet Systems Consortium ("ISC") # # Permission to use, copy, modify, and distribute this software for any -- cgit v1.2.3 From d3671dfe4db2b60e5ad7b574e31dde3bdd79a00a Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Wed, 20 Aug 2014 13:05:52 +0000 Subject: Get final db.close() right, even if it is unnecessary. svn path=/trunk/; revision=5930 --- potpourri/validation-status-sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index 90d66435..81d79b48 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -68,7 +68,7 @@ class Parser(object): self.parse_xml() if self.parsed > 1: sys.stderr.write("\n") - db.close() + self.db.close() def init_sql(self): -- cgit v1.2.3 From 63c933922c2a89f7612fabc7fc96a3c5ae359556 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Mon, 25 Aug 2014 04:48:26 +0000 Subject: Solve several minor problems at once by storing timestamps as seconds-since-epoch. svn path=/trunk/; revision=5933 --- potpourri/validation-status-sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index 81d79b48..9e68d4ba 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -132,10 +132,10 @@ class Parser(object): return xml = self.read_xml() with self.db: - session_id = self.db.execute("INSERT INTO sessions (session, handle) VALUES (datetime(?), ?)", + session_id = self.db.execute("INSERT INTO sessions (session, handle) VALUES (strftime('%s', ?), ?)", (xml.get("date"), self.handle)).lastrowid self.db.executemany("INSERT INTO events (session_id, timestamp, generation_id, code_id, uri_id) " - "VALUES (?, datetime(?), ?, ?, ?)", + "VALUES (?, strftime('%s', ?), ?, ?, ?)", ((session_id, x.get("timestamp"), self.string_id("generations", x.get("generation")), -- cgit v1.2.3 From 005a8b3b6575eca08abfdf2569564b1e8895de97 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Tue, 26 Aug 2014 01:31:48 +0000 Subject: Use named indexes to make it possible to add and remove them later. May have finally gotten the right balance of indexes for basic use. Use various optimizations to let us load large data sets before the heat death of the universe. Some of these optimizations are dangerous, in the sense that if this script crashes while constructing the database, you'll have to rebuild the database from scratch. Probably ought to offer both this and the slow-but-safe approach as command line options, but: - The speed improvements look to be worth at least an order of magnitude in the runtime, - The speed improvements also prevent all the fsync() calls in the safe approach from turning the underlying filesystem into cream cheese while the script is running, and - This script is just a research anlysis tool to begin with. So I think the risk is justified in this case. svn path=/trunk/; revision=5934 --- potpourri/validation-status-sql.py | 76 +++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 26 deletions(-) (limited to 'potpourri') diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py index 9e68d4ba..fc52e64b 100755 --- a/potpourri/validation-status-sql.py +++ b/potpourri/validation-status-sql.py @@ -63,11 +63,13 @@ class Parser(object): self.args = args self.init_sql() self.init_hook() + self.index1() self.parsed = 1 for self.current, self.iterval in enumerate(self.iterator, 1): self.parse_xml() if self.parsed > 1: sys.stderr.write("\n") + self.index2() self.db.close() @@ -75,52 +77,74 @@ class Parser(object): creating = not os.path.exists(self.args.database) self.db = sqlite3.connect(self.args.database) self.db.text_factory = str - self.db.execute("PRAGMA foreign_keys = on") + self.db.executescript(''' + PRAGMA foreign_keys = off; + PRAGMA synchronous = off; + PRAGMA count_changes = off; + ''') if creating: self.db.executescript(''' CREATE TABLE sessions ( - id INTEGER PRIMARY KEY NOT NULL, - session DATETIME UNIQUE NOT NULL, - handle TEXT UNIQUE NOT NULL); + session_id INTEGER PRIMARY KEY NOT NULL, + session DATETIME NOT NULL, + handle TEXT NOT NULL + ); CREATE TABLE uris ( - id INTEGER PRIMARY KEY NOT NULL, - uri TEXT UNIQUE NOT NULL); + uri_id INTEGER PRIMARY KEY NOT NULL, + uri TEXT NOT NULL + ); CREATE TABLE codes ( - id INTEGER PRIMARY KEY NOT NULL, - code TEXT UNIQUE NOT NULL); + code_id INTEGER PRIMARY KEY NOT NULL, + code TEXT NOT NULL + ); CREATE TABLE generations ( - id INTEGER PRIMARY KEY NOT NULL, - generation TEXT UNIQUE); + generation_id INTEGER PRIMARY KEY NOT NULL, + generation TEXT NOT NULL + ); CREATE TABLE events ( id INTEGER PRIMARY KEY NOT NULL, timestamp DATETIME NOT NULL, - session_id INTEGER NOT NULL REFERENCES sessions(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - generation_id INTEGER NOT NULL REFERENCES generations(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - code_id INTEGER NOT NULL REFERENCES codes(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - uri_id INTEGER NOT NULL REFERENCES uris(id) ON DELETE RESTRICT ON UPDATE RESTRICT, - UNIQUE (timestamp, generation_id, code_id, uri_id)); + session_id INTEGER NOT NULL REFERENCES sessions (session_id) ON DELETE RESTRICT ON UPDATE RESTRICT, + generation_id INTEGER NOT NULL REFERENCES generations (generation_id) ON DELETE RESTRICT ON UPDATE RESTRICT, + code_id INTEGER NOT NULL REFERENCES codes (code_id) ON DELETE RESTRICT ON UPDATE RESTRICT, + uri_id INTEGER NOT NULL REFERENCES uris (uri_id) ON DELETE RESTRICT ON UPDATE RESTRICT + ); CREATE VIEW status AS - SELECT events.id, handle, session, timestamp, generation, uri, code - FROM events - JOIN sessions ON sessions.id = events.session_id - JOIN uris ON uris.id = events.uri_id - JOIN codes ON codes.id = events.code_id - JOIN generations ON generations.id = events.generation_id; + SELECT id, handle, session, timestamp, generation, code, uri + FROM events + NATURAL JOIN sessions + NATURAL JOIN uris + NATURAL JOIN codes + NATURAL JOIN generations; ''') + + def index1(self): + self.db.executescript(''' + CREATE UNIQUE INDEX IF NOT EXISTS sessions_index ON sessions (session); + CREATE UNIQUE INDEX IF NOT EXISTS handles_index ON sessions (handle); + CREATE UNIQUE INDEX IF NOT EXISTS uris_index ON uris (uri); + CREATE UNIQUE INDEX IF NOT EXISTS codes_index ON codes (code); + CREATE UNIQUE INDEX IF NOT EXISTS generations_index ON generations (generation); + ''') + + + def index2(self): + self.db.executescript(''' + CREATE UNIQUE INDEX IF NOT EXISTS events_index ON events (uri_id, timestamp, code_id, generation_id); + ''') + + def string_id(self, table, value): field = table.rstrip("s") try: - if value is None: - return self.db.execute("SELECT id FROM %s WHERE %s IS NULL" % (table, field)).fetchone()[0] - else: - return self.db.execute("SELECT id FROM %s WHERE %s = ?" % (table, field), (value,)).fetchone()[0] + return self.db.execute("SELECT %s_id FROM %s WHERE %s = ?" % (field, table, field), (value,)).fetchone()[0] except: return self.db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid @@ -138,7 +162,7 @@ class Parser(object): "VALUES (?, strftime('%s', ?), ?, ?, ?)", ((session_id, x.get("timestamp"), - self.string_id("generations", x.get("generation")), + self.string_id("generations", x.get("generation", "none")), self.string_id("codes", x.get("status")), self.string_id("uris", x.text.strip())) for x in xml.findall("validation_status"))) -- cgit v1.2.3 From 429adae788694f109174e35467c49d13b9533fe2 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Sat, 13 Sep 2014 03:56:54 +0000 Subject: Groundwork for Django ORM world conquest: sort out settings.py mess. svn path=/branches/tk713/; revision=5948 --- potpourri/upgrade-add-ghostbusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'potpourri') diff --git a/potpourri/upgrade-add-ghostbusters.py b/potpourri/upgrade-add-ghostbusters.py index a8c8a92b..2548487c 100644 --- a/potpourri/upgrade-add-ghostbusters.py +++ b/potpourri/upgrade-add-ghostbusters.py @@ -43,7 +43,7 @@ for o, a in opts: if o in ("-c", "--config"): cfg_file = a -cfg = rpki.config.parser(cfg_file, "myrpki") +cfg = rpki.config.parser(filename = cfg_file, section = "myrpki") fix("irdbd", """ CREATE TABLE ghostbuster_request ( -- cgit v1.2.3