3 år sedan · be858bc28a
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,7 @@
 
				+TAGS
			
 
				+__pycache__
			
 
				+attachments
			
 
				+pelican/content/*/*
			
 
				+pelican/pelicanconf.py
			
 
				+trac.db
			
 
				+pelican/website
			
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -0,0 +1,24 @@
 
				+# Sample settings, tweak for particular jobs on make command line or
			
 
				+# via environment variables.
			
 
				+
			
 
				+SOURCE_URL		:= https://git.cryptech.is/
			
 
				+SOURCE_TRAC_DB		:= bikeshed.cryptech.is:/home/trac/db/trac.db
			
 
				+SOURCE_TRAC_ATTACHMENTS := bikeshed.cryptech.is:/home/trac/files/attachments
			
 
				+
			
 
				+all:
			
 
				+	./extract.py --source-url ${SOURCE_URL}
			
 
				+	cd pelican; pelican --output website --settings pelicanconf.py --fatal errors content
			
 
				+
			
 
				+fetch:
			
 
				+	rsync -aP --delete ${SOURCE_TRAC_DB} ${SOURCE_TRAC_ATTACHMENTS} .
			
 
				+
			
 
				+clean:
			
 
				+	rm -rf wiki pelican
			
 
				+
			
 
				+distclean: clean
			
 
				+	rm -rf trac.db attachments
			
 
				+
			
 
				+webfsd:
			
 
				+	webfsd -r pelican/website -4 -L - -F -f index.html
			
 
				+
			
 
				+.PHONY: all clean fetch distclean webfsd
			
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,6 @@
 
				+# BSD-style Makefile so that `make` works on both FreeBSD and Linux.
			
 
				+# GNU make will just ignore this because of the GNUmakefile.
			
 
				+
			
 
				+.MAIN: all
			
 
				+
			
 
				+${.TARGETS}: ; @gmake $@
			
--- a/README
+++ b/README
@@ -0,0 +1,15 @@
 
				+Tools to convert Permatrac Wiki to Markdown format.
			
 
				+
			
 
				+This is just the tool set for the Wiki, there's another set of scripts
			
 
				+for converting Trac tickets but I haven't yet converted them from the
			
 
				+task-specific form they're in at the moment (from an old Trac site
			
 
				+whose tickets I had to migrate to GitHub issues about five years
			
 
				+ago...).
			
 
				+
			
 
				+As a test, we run the Markdown from the conversion process through
			
 
				+Pelican to generate a static site.  This may or may not be useful in
			
 
				+its own right, but for our purposes simply running the Pelican site
			
 
				+generator is helpful, since it spots a lot of dumb Markdown errors for
			
 
				+us (missing links, horribly borked Markdown syntax, etc).
			
 
				+
			
 
				+Pelican content format: https://docs.getpelican.com/en/latest/content.html
			
--- a/extract.py
+++ b/extract.py
@@ -0,0 +1,128 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+import fnmatch
			
 
				+import hashlib
			
 
				+import json
			
 
				+import os
			
 
				+import shutil
			
 
				+import sqlite3
			
 
				+import sys
			
 
				+import time
			
 
				+import urllib.parse
			
 
				+import argparse
			
 
				+
			
 
				+import trac2md
			
 
				+
			
 
				+wiki_query = '''
			
 
				+  SELECT
			
 
				+    name, 
			
 
				+    author,
			
 
				+    version, 
			
 
				+    time / 1000000 AS time, 
			
 
				+    text 
			
 
				+  FROM wiki
			
 
				+  ORDER BY
			
 
				+    name, version
			
 
				+'''
			
 
				+
			
 
				+attachment_query = '''
			
 
				+  SELECT
			
 
				+    id,
			
 
				+    filename,
			
 
				+    size,
			
 
				+    author,
			
 
				+    description,
			
 
				+    ipnr,
			
 
				+    time / 1000000 AS createdtime
			
 
				+  FROM
			
 
				+    attachment
			
 
				+  WHERE
			
 
				+    type = 'wiki'
			
 
				+  ORDER BY
			
 
				+    filename, time
			
 
				+'''
			
 
				+
			
 
				+def attachment_link(row):
			
 
				+    h   = lambda whatever: hashlib.sha1(whatever.encode()).hexdigest()
			
 
				+    h1  = h(row.id)
			
 
				+    h2  = h(row.filename)
			
 
				+    fn2 = os.path.splitext(row["filename"])[1]
			
 
				+    return \
			
 
				+        os.path.join("attachments", "wiki", h1[:3], h1, h2 + fn2), \
			
 
				+        os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
			
 
				+
			
 
				+class Filter:
			
 
				+
			
 
				+    def __init__(self, filename = "filter.json"):
			
 
				+        with open(filename) as f:
			
 
				+            filter = json.load(f)
			
 
				+        if not all(action in "-+" for action, pattern in filter):
			
 
				+            sys.exit("Bad action \"{}\" in filter".format(action))
			
 
				+        self.filter = tuple((action == "+", pattern) for action, pattern in filter)
			
 
				+
			
 
				+    def __call__(self, name):
			
 
				+        for action, pattern in self.filter:
			
 
				+            if fnmatch.fnmatch(name, pattern):
			
 
				+                return action
			
 
				+        return True
			
 
				+
			
 
				+class Row(sqlite3.Row):
			
 
				+
			
 
				+    def __getattr__(self, name):
			
 
				+        return self[name]
			
 
				+
			
 
				+    @property
			
 
				+    def isotime(self):
			
 
				+        return time.strftime("%Y-%m-%d %H:%M", time.gmtime(self.time))
			
 
				+
			
 
				+def markdown_header(row, first_published):
			
 
				+    if row.name in first_published:
			
 
				+        modtime = "Modified: {}\n".format(row.isotime)
			
 
				+    else:
			
 
				+        modtime = ""
			
 
				+        first_published[row.name] = row.isotime
			
 
				+    return "Title: {}\nAuthor: {}\nDate: {}\n{}\n".format(row.name, row.author, first_published[row.name], modtime)
			
 
				+
			
 
				+def main():
			
 
				+    ap = argparse.ArgumentParser()
			
 
				+    ap.add_argument("--source-url")
			
 
				+    args = ap.parse_args()
			
 
				+
			
 
				+    for dn in ("wiki", "pelican"):
			
 
				+        shutil.rmtree(dn)
			
 
				+
			
 
				+    for dn in ("wiki", "pelican/content"):
			
 
				+        os.makedirs(dn)
			
 
				+
			
 
				+    os.link("pelicanconf.py", "pelican/pelicanconf.py")
			
 
				+
			
 
				+    wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)
			
 
				+
			
 
				+    keep = Filter()
			
 
				+
			
 
				+    first_published = {}
			
 
				+
			
 
				+    db = sqlite3.connect("trac.db")
			
 
				+    db.row_factory = Row
			
 
				+
			
 
				+    for row in db.execute(wiki_query):
			
 
				+        if keep(row.name):
			
 
				+            slug = urllib.parse.quote(row.name, "")
			
 
				+            #print(slug, row.version)
			
 
				+            with open("wiki/{}.trac".format(slug), "w") as f:
			
 
				+                f.write(row.text)
			
 
				+            md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
			
 
				+            with open("pelican/content/{}.md".format(slug), "w") as f:
			
 
				+                f.write(md)
			
 
				+
			
 
				+    for row in db.execute(attachment_query):
			
 
				+        src, dst = attachment_link(row)
			
 
				+        #print("{} => {}".format(dst, src))
			
 
				+        if not os.path.isdir(os.path.dirname(dst)):
			
 
				+            os.makedirs(os.path.dirname(dst))
			
 
				+        os.link(src, dst)
			
 
				+
			
 
				+    db.close()
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/filter.json
+++ b/filter.json
@@ -0,0 +1,19 @@
 
				+[
			
 
				+    ["+", "WikiStart"],
			
 
				+    ["-", "CamelCase"],
			
 
				+    ["-", "EDAToolchainSurvey\""],
			
 
				+    ["-", "GitRepositories*"],
			
 
				+    ["-", "InterMapTxt"],
			
 
				+    ["-", "InterTrac"],
			
 
				+    ["-", "InterWiki"],
			
 
				+    ["-", "PageTemplates"],
			
 
				+    ["-", "PhotoFolder"],
			
 
				+    ["-", "RecentChanges"],
			
 
				+    ["-", "SandBox"],
			
 
				+    ["-", "ScratchPage"],
			
 
				+    ["-", "TicketQuery"],
			
 
				+    ["-", "TitleIndex"],
			
 
				+    ["-", "Trac*"],
			
 
				+    ["-", "Wiki*"],
			
 
				+    ["+", "*"]
			
 
				+]
			
--- a/pelicanconf.py
+++ b/pelicanconf.py
@@ -0,0 +1,33 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*- #
			
 
				+
			
 
				+AUTHOR = "Cryptech Core Team"
			
 
				+SITENAME = "Cryptech Project"
			
 
				+
			
 
				+PATH = "content"
			
 
				+TIMEZONE = "UTC"
			
 
				+DEFAULT_LANG = "en"
			
 
				+
			
 
				+# Feed generation is usually not desired when developing
			
 
				+SITEURL = ""
			
 
				+RELATIVE_URLS = True
			
 
				+FEED_ALL_ATOM = None
			
 
				+CATEGORY_FEED_ATOM = None
			
 
				+TRANSLATION_FEED_ATOM = None
			
 
				+AUTHOR_FEED_ATOM = None
			
 
				+AUTHOR_FEED_RSS = None
			
 
				+
			
 
				+# Blogroll
			
 
				+LINKS = (("Pelican", "http://getpelican.com/"),
			
 
				+         ("Python.org", "http://python.org/"),
			
 
				+         ("Jinja2", "http://jinja.pocoo.org/"))
			
 
				+LINKS_WIDGET_NAME = "Links"
			
 
				+
			
 
				+# Social widget.  Can't get rid of this with default theme, only change its name.
			
 
				+# Fiddle with themes later
			
 
				+SOCIAL = ()
			
 
				+SOCIAL_WIDGET_NAME = "Subscribe"
			
 
				+
			
 
				+DEFAULT_PAGINATION = 10
			
 
				+
			
 
				+#THEME = "/home/blog/pelican-themes/sundown"
			
--- a/trac2md.py
+++ b/trac2md.py
@@ -0,0 +1,193 @@
 
				+# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
			
 
				+# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
			
 
				+# Has mutated considerably since then.
			
 
				+
			
 
				+import re
			
 
				+from urllib.parse import quote
			
 
				+
			
 
				+class Trac2Markdown:
			
 
				+
			
 
				+    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
			
 
				+    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
			
 
				+
			
 
				+    wikiheading_patterns = tuple(
			
 
				+        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
			
 
				+        for level in range(1, 7)
			
 
				+    )
			
 
				+
			
 
				+    def convert_headers(self, line):
			
 
				+        for level_count, header in self.wikiheading_patterns:
			
 
				+            try:
			
 
				+                level = header.search(line).group(1)
			
 
				+                if level:
			
 
				+                    line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
			
 
				+                    break          # No need to check other heading levels
			
 
				+            except:
			
 
				+                pass                # Try the next heading level
			
 
				+        return line
			
 
				+
			
 
				+    def convert_to_creole(self, m):
			
 
				+        # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
			
 
				+        # Creole's is easier to parse and harder to confuse with partially converted Markdown.
			
 
				+
			
 
				+        text = m.group(1).strip()
			
 
				+        if " " in text:
			
 
				+            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
			
 
				+        elif ":" in text or self.camelcase_pattern.match(text):
			
 
				+            return "[[{}]]".format(text)
			
 
				+        else:
			
 
				+            return m.group(0)
			
 
				+
			
 
				+    # Probably most of the non-wiki scheme tests should become a table in an
			
 
				+    # extended JSON config file which maps
			
 
				+    #
			
 
				+    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
			
 
				+
			
 
				+    def convert_wikilinks(self, m):
			
 
				+        scheme, link, text = [p.strip() if p else p for p in  m.groups()]
			
 
				+        if text is None:
			
 
				+            text = link
			
 
				+        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
			
 
				+            link = link[1:-1]
			
 
				+        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
			
 
				+            text = text[1:-1]
			
 
				+        if text == link and link.startswith("http") and "://" in link:
			
 
				+            return "<{}>".format(link)
			
 
				+        elif scheme == "attachment:":
			
 
				+            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
			
 
				+        elif scheme in ("source:", "browser:"):
			
 
				+            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
			
 
				+        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
			
 
				+            return "[{}]({{filename}}{}.md)".format(text, link)
			
 
				+        else:
			
 
				+            return "[{}]({})".format(text, link)
			
 
				+
			
 
				+    def convert_image(self, m):
			
 
				+        text = m.group(1).split(",")[0].strip()
			
 
				+        if "://" in text:
			
 
				+            return "<img src=\"{}\">".format(text)
			
 
				+        else:
			
 
				+            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))
			
 
				+
			
 
				+    def __init__(self, source_url):
			
 
				+        self.source_url = source_url
			
 
				+        self.pattern_actions = (
			
 
				+
			
 
				+            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
			
 
				+            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),
			
 
				+
			
 
				+            # Convert CamelCase links to explicit links
			
 
				+            (self.camelcase_pattern,                                                    r"[[\1]]"),
			
 
				+
			
 
				+            # Convert !x quoting
			
 
				+            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),
			
 
				+
			
 
				+            # Convert (limited subset of) spans
			
 
				+            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),
			
 
				+
			
 
				+            # Convert images
			
 
				+            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),
			
 
				+
			
 
				+            # Delete Trac macros that have no useful counterpart
			
 
				+            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),
			
 
				+
			
 
				+            # Convert wiki links
			
 
				+            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),
			
 
				+
			
 
				+            # Convert striked through text
			
 
				+            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),
			
 
				+
			
 
				+            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
			
 
				+            (re.compile(r"\\\\$"),                                                      r"  "),
			
 
				+
			
 
				+            # Convert bold and italic text (do this last)
			
 
				+            (re.compile(r"'''"),                                                        r"**"),
			
 
				+            (re.compile(r"''"),                                                         r"*"),
			
 
				+        )
			
 
				+
			
 
				+    def __call__(self, content, slug):
			
 
				+        self.slug = slug
			
 
				+
			
 
				+        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
			
 
				+        new_content = []
			
 
				+
			
 
				+        code_block = False
			
 
				+        in_list = False
			
 
				+        in_table = False
			
 
				+        nested_level = 0
			
 
				+        prev_indent = 0
			
 
				+
			
 
				+        while old_content:
			
 
				+            line = old_content.pop(0).rstrip()
			
 
				+            tail = ["\n"]
			
 
				+            while "{{{" in line or "}}}" in line:
			
 
				+                if "{{{" in line:
			
 
				+                    code_block = True
			
 
				+                    line = line.replace("{{{", "```")
			
 
				+                if "}}}" in line:
			
 
				+                    code_block = False
			
 
				+                    line = line.replace("}}}", "```")
			
 
				+            if not code_block:
			
 
				+
			
 
				+                # Convert tables.  References:
			
 
				+                #   https://github.github.com/gfm/#tables-extension-
			
 
				+                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
			
 
				+                # Table start: line containing "||"; table end: blank line?
			
 
				+                #
			
 
				+                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
			
 
				+                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
			
 
				+                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
			
 
				+                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
			
 
				+                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
			
 
				+                #
			
 
				+                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
			
 
				+                # structure to something markdown will believe.
			
 
				+
			
 
				+                if line.strip().startswith("||"):
			
 
				+                    line = line.replace("=|", "|").replace("|=", "|")
			
 
				+                    line = line.replace("||", "|")
			
 
				+                    if not in_table:
			
 
				+                        tail.append("|---" * (line.count("|") - 1) + "|\n")
			
 
				+                    in_table = True
			
 
				+                elif in_table and not line.strip().startswith("||"):
			
 
				+                    new_content.append("\n")
			
 
				+                    in_table = False
			
 
				+
			
 
				+                #
			
 
				+                # Convert bullet lists.  The start and end of a list needs an empty line.
			
 
				+                #
			
 
				+                nested_line = line.lstrip(' ')
			
 
				+                if nested_line.startswith('- ') or nested_line.startswith('* '):
			
 
				+                    if not in_list:
			
 
				+                        new_content.append("\n")
			
 
				+                        nested_level = 0
			
 
				+                        prev_indent = 0
			
 
				+                        in_list = True
			
 
				+                    indent = len(line) - len(nested_line)
			
 
				+                    text_indent = len(line) - len(nested_line[1:].lstrip())
			
 
				+                    if indent > prev_indent:
			
 
				+                        nested_level += 1
			
 
				+                    elif indent < prev_indent:
			
 
				+                        nested_level -= 1
			
 
				+                    prev_indent = indent
			
 
				+                    line = '    ' * nested_level + nested_line
			
 
				+                elif in_list and len(line) < len(nested_line) + text_indent:
			
 
				+                    new_content.append("\n")
			
 
				+                    in_list = False
			
 
				+                    nested_level = 0
			
 
				+                    prev_indent = 0
			
 
				+                    text_indent = 0
			
 
				+
			
 
				+                # Convert headers
			
 
				+                line = self.convert_headers(line)
			
 
				+
			
 
				+                # Rest is regexp-driven conversions
			
 
				+                for pattern, action in self.pattern_actions:
			
 
				+                    line = pattern.sub(action, line)
			
 
				+
			
 
				+            new_content.append(line)
			
 
				+            new_content.extend(tail)
			
 
				+
			
 
				+        del self.slug
			
 
				+
			
 
				+        return "".join(new_content)