Use named indexes to make it possible to add and remove them later.

May have finally gotten the right balance of indexes for basic use. Use various optimizations to let us load large data sets before the heat death of the universe. Some of these optimizations are dangerous, in the sense that if this script crashes while constructing the database, you'll have to rebuild the database from scratch. Probably ought to offer both this and the slow-but-safe approach as command line options, but: - The speed improvements look to be worth at least an order of magnitude in the runtime, - The speed improvements also prevent all the fsync() calls in the safe approach from turning the underlying filesystem into cream cheese while the script is running, and - This script is just a research anlysis tool to begin with. So I think the risk is justified in this case. svn path=/trunk/; revision=5934
author: Rob Austein <sra@hactrn.net> 2014-08-26 01:31:48 +0000
committer: Rob Austein <sra@hactrn.net> 2014-08-26 01:31:48 +0000
commit: 005a8b3b6575eca08abfdf2569564b1e8895de97 (patch)
tree: b4c79b8f9d5398cfe1859da1220561932ee3081a
parent: 63c933922c2a89f7612fabc7fc96a3c5ae359556 (diff)
1 files changed, 50 insertions, 26 deletions
diff --git a/potpourri/validation-status-sql.py b/potpourri/validation-status-sql.py
index 9e68d4ba..fc52e64b 100755
--- a/potpourri/validation-status-sql.py
+++ b/potpourri/validation-status-sql.py
@@ -63,11 +63,13 @@ class Parser(object):
     self.args = args
     self.init_sql()
     self.init_hook()
+    self.index1()
     self.parsed = 1
     for self.current, self.iterval in enumerate(self.iterator, 1):
       self.parse_xml()
     if self.parsed > 1:
       sys.stderr.write("\n")
+    self.index2()
     self.db.close()
 
 
@@ -75,52 +77,74 @@ class Parser(object):
     creating = not os.path.exists(self.args.database)
     self.db = sqlite3.connect(self.args.database)
     self.db.text_factory = str
-    self.db.execute("PRAGMA foreign_keys = on")
+    self.db.executescript('''
+      PRAGMA foreign_keys  = off;
+      PRAGMA synchronous   = off;
+      PRAGMA count_changes = off;
+      ''')
 
     if creating:
       self.db.executescript('''
         CREATE TABLE sessions (
-              id              INTEGER PRIMARY KEY NOT NULL,
-              session         DATETIME UNIQUE NOT NULL,
-              handle          TEXT UNIQUE NOT NULL);
+              session_id      INTEGER PRIMARY KEY NOT NULL,
+              session         DATETIME NOT NULL,
+              handle          TEXT NOT NULL
+        );
 
         CREATE TABLE uris (
-              id              INTEGER PRIMARY KEY NOT NULL,
-              uri             TEXT UNIQUE NOT NULL);
+              uri_id          INTEGER PRIMARY KEY NOT NULL,
+              uri             TEXT NOT NULL
+        );
 
         CREATE TABLE codes (
-              id              INTEGER PRIMARY KEY NOT NULL,
-              code            TEXT UNIQUE NOT NULL);
+              code_id         INTEGER PRIMARY KEY NOT NULL,
+              code            TEXT NOT NULL
+        );
 
         CREATE TABLE generations (
-              id              INTEGER PRIMARY KEY NOT NULL,
-              generation      TEXT UNIQUE);
+              generation_id   INTEGER PRIMARY KEY NOT NULL,
+              generation      TEXT NOT NULL
+        );
 
         CREATE TABLE events (
               id              INTEGER PRIMARY KEY NOT NULL,
               timestamp       DATETIME NOT NULL,
-              session_id      INTEGER NOT NULL REFERENCES sessions(id)    ON DELETE RESTRICT ON UPDATE RESTRICT,
-              generation_id   INTEGER NOT NULL REFERENCES generations(id) ON DELETE RESTRICT ON UPDATE RESTRICT,
-              code_id         INTEGER NOT NULL REFERENCES codes(id)       ON DELETE RESTRICT ON UPDATE RESTRICT,
-              uri_id          INTEGER NOT NULL REFERENCES uris(id)        ON DELETE RESTRICT ON UPDATE RESTRICT,
-              UNIQUE          (timestamp, generation_id, code_id, uri_id));
+              session_id      INTEGER NOT NULL REFERENCES sessions    (session_id)      ON DELETE RESTRICT ON UPDATE RESTRICT,
+              generation_id   INTEGER NOT NULL REFERENCES generations (generation_id)   ON DELETE RESTRICT ON UPDATE RESTRICT,
+              code_id         INTEGER NOT NULL REFERENCES codes       (code_id)         ON DELETE RESTRICT ON UPDATE RESTRICT,
+              uri_id          INTEGER NOT NULL REFERENCES uris        (uri_id)          ON DELETE RESTRICT ON UPDATE RESTRICT
+        );
 
         CREATE VIEW status AS
-             SELECT events.id, handle, session, timestamp, generation, uri, code
-             FROM events
-             JOIN sessions    ON sessions.id    = events.session_id
-             JOIN uris        ON uris.id        = events.uri_id
-             JOIN codes       ON codes.id       = events.code_id
-             JOIN generations ON generations.id = events.generation_id;
+             SELECT id, handle, session, timestamp, generation, code, uri
+             FROM               events
+             NATURAL JOIN       sessions
+             NATURAL JOIN       uris
+             NATURAL JOIN       codes
+             NATURAL JOIN       generations;
         ''')
 
+
+  def index1(self):
+    self.db.executescript('''
+      CREATE UNIQUE INDEX IF NOT EXISTS sessions_index         ON sessions     (session);
+      CREATE UNIQUE INDEX IF NOT EXISTS handles_index          ON sessions     (handle);
+      CREATE UNIQUE INDEX IF NOT EXISTS uris_index             ON uris         (uri);
+      CREATE UNIQUE INDEX IF NOT EXISTS codes_index            ON codes        (code);
+      CREATE UNIQUE INDEX IF NOT EXISTS generations_index      ON generations  (generation);
+      ''')
+
+
+  def index2(self):
+    self.db.executescript('''
+      CREATE UNIQUE INDEX IF NOT EXISTS events_index           ON events       (uri_id, timestamp, code_id, generation_id);
+      ''')
+
+
   def string_id(self, table, value):
     field = table.rstrip("s")
     try:
-      if value is None:
-        return self.db.execute("SELECT id FROM %s WHERE %s IS NULL" % (table, field)).fetchone()[0]
-      else:
-        return self.db.execute("SELECT id FROM %s WHERE %s = ?" % (table, field), (value,)).fetchone()[0]
+      return self.db.execute("SELECT %s_id FROM %s WHERE %s = ?" % (field, table, field), (value,)).fetchone()[0]
     except:
       return self.db.execute("INSERT INTO %s (%s) VALUES (?)" % (table, field), (value,)).lastrowid
 
@@ -138,7 +162,7 @@ class Parser(object):
                           "VALUES (?, strftime('%s', ?), ?, ?, ?)",
                           ((session_id,
                             x.get("timestamp"),
-                            self.string_id("generations", x.get("generation")),
+                            self.string_id("generations", x.get("generation", "none")),
                             self.string_id("codes",       x.get("status")),
                             self.string_id("uris",        x.text.strip()))
                            for x in xml.findall("validation_status")))
author	Rob Austein <sra@hactrn.net>	2014-08-26 01:31:48 +0000
committer	Rob Austein <sra@hactrn.net>	2014-08-26 01:31:48 +0000
commit	005a8b3b6575eca08abfdf2569564b1e8895de97 (patch)
tree	b4c79b8f9d5398cfe1859da1220561932ee3081a
parent	63c933922c2a89f7612fabc7fc96a3c5ae359556 (diff)