From be858bc28af94a498b613fa6541ed3e730fd9473 Mon Sep 17 00:00:00 2001
From: Rob Austein <sra@hactrn.net>
Date: Wed, 2 Jun 2021 20:09:35 +0000
Subject: First cut at project-independent version of these scripts

---
 .gitignore     |   7 +++
 GNUmakefile    |  24 +++++++
 Makefile       |   6 ++
 README         |  15 +++++
 extract.py     | 128 ++++++++++++++++++++++++++++++++++++++
 filter.json    |  19 ++++++
 pelicanconf.py |  33 ++++++++++
 trac2md.py     | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 425 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 GNUmakefile
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100755 extract.py
 create mode 100644 filter.json
 create mode 100644 pelicanconf.py
 create mode 100755 trac2md.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3888036
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+TAGS
+__pycache__
+attachments
+pelican/content/*/*
+pelican/pelicanconf.py
+trac.db
+pelican/website
diff --git a/GNUmakefile b/GNUmakefile
new file mode 100644
index 0000000..a6e7257
--- /dev/null
+++ b/GNUmakefile
@@ -0,0 +1,24 @@
+# Sample settings, tweak for particular jobs on make command line or
+# via environment variables (hence ?=: a plain assignment here would
+# override the environment unless make is run with -e).
+SOURCE_URL		?= https://git.cryptech.is/
+SOURCE_TRAC_DB		?= bikeshed.cryptech.is:/home/trac/db/trac.db
+SOURCE_TRAC_ATTACHMENTS ?= bikeshed.cryptech.is:/home/trac/files/attachments
+# Convert the wiki, then run Pelican over the result (only if the cd succeeds).
+all:
+	./extract.py --source-url ${SOURCE_URL}
+	cd pelican && pelican --output website --settings pelicanconf.py --fatal errors content
+
+fetch:
+	rsync -aP --delete ${SOURCE_TRAC_DB} ${SOURCE_TRAC_ATTACHMENTS} .
+
+clean:
+	rm -rf wiki pelican
+
+distclean: clean
+	rm -rf trac.db attachments
+
+webfsd:
+	webfsd -r pelican/website -4 -L - -F -f index.html
+
+.PHONY: all clean fetch distclean webfsd
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..45988f6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+# BSD-style Makefile so that `make` works on both FreeBSD and Linux.
+# GNU make will just ignore this because of the GNUmakefile.
+
+.MAIN: all
+
+${.TARGETS}: ; @gmake $@
diff --git a/README b/README
new file mode 100644
index 0000000..ecf209a
--- /dev/null
+++ b/README
@@ -0,0 +1,15 @@
+Tools to convert Permatrac Wiki to Markdown format.
+
+This is just the tool set for the Wiki; there's another set of scripts
+for converting Trac tickets, but I haven't yet converted them from the
+task-specific form they're in at the moment (from an old Trac site
+whose tickets I had to migrate to GitHub issues about five years
+ago...).
+
+As a test, we run the Markdown from the conversion process through
+Pelican to generate a static site.  This may or may not be useful in
+its own right, but for our purposes simply running the Pelican site
+generator is helpful, since it spots a lot of dumb Markdown errors for
+us (missing links, horribly borked Markdown syntax, etc).
+
+Pelican content format: https://docs.getpelican.com/en/latest/content.html
diff --git a/extract.py b/extract.py
new file mode 100755
index 0000000..368211c
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import fnmatch
+import hashlib
+import json
+import os
+import shutil
+import sqlite3
+import sys
+import time
+import urllib.parse
+import argparse
+
+import trac2md
+
+wiki_query = '''
+  SELECT
+    name, 
+    author,
+    version, 
+    time / 1000000 AS time, 
+    text 
+  FROM wiki
+  ORDER BY
+    name, version
+'''
+
+attachment_query = '''
+  SELECT
+    id,
+    filename,
+    size,
+    author,
+    description,
+    ipnr,
+    time / 1000000 AS createdtime
+  FROM
+    attachment
+  WHERE
+    type = 'wiki'
+  ORDER BY
+    filename, time
+'''
+
+def attachment_link(row):
+    """Return the (source, destination) file paths for one wiki attachment row."""
+    # The source path components are SHA-1 hex digests of the page id and of the file name.
+    page_digest = hashlib.sha1(row.id.encode()).hexdigest()
+    file_digest = hashlib.sha1(row.filename.encode()).hexdigest()
+    stored_name = file_digest + os.path.splitext(row["filename"])[1]
+    source = os.path.join("attachments", "wiki", page_digest[:3], page_digest, stored_name)
+    return source, os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
+
+class Filter:
+    """Ordered keep/drop rules for page names, loaded from a JSON list of [action, glob] pairs."""
+
+    def __init__(self, filename = "filter.json"):
+        with open(filename) as f:
+            rules = json.load(f)
+        for action, pattern in rules:
+            if action not in ("+", "-"):    # exact match: a substring test would also accept "" or "-+"
+                sys.exit("Bad action \"{}\" in filter".format(action))
+        self.filter = tuple((action == "+", pattern) for action, pattern in rules)
+
+    def __call__(self, name):
+        """Return the verdict (True = keep) of the first rule whose glob matches name, else True."""
+        return next((keep for keep, pattern in self.filter if fnmatch.fnmatch(name, pattern)), True)
+
+class Row(sqlite3.Row):
+    """sqlite3.Row variant that also exposes each column as an attribute."""
+    def __getattr__(self, name):
+        return self.__getitem__(name)    # reached only when normal attribute lookup fails
+    @property
+    def isotime(self):
+        utc = time.gmtime(self["time"])    # the "time" column is passed to gmtime as epoch seconds
+        return time.strftime("%Y-%m-%d %H:%M", utc)
+
+def markdown_header(row, first_published):
+    """Return the metadata header for one page revision; first_published records each page's first date."""
+    revised = row.name in first_published
+    created = first_published.setdefault(row.name, row.isotime)
+    fields = [("Title", row.name), ("Author", row.author), ("Date", created)]
+    fields += [("Modified", row.isotime)] if revised else []
+    return "".join("{}: {}\n".format(key, value) for key, value in fields) + "\n"
+
+def main():
+    """Rebuild wiki/ and pelican/ from trac.db: write each kept page revision and link attachments.
+
+    Reads --source-url from the command line for trac2md.Trac2Markdown; output is written as UTF-8.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--source-url")
+    args = ap.parse_args()
+
+    # Start from a clean slate; the trees do not exist on a first run or after "make clean".
+    for dn in ("wiki", "pelican"):
+        if os.path.isdir(dn):
+            shutil.rmtree(dn)
+    for dn in ("wiki", "pelican/content"):
+        os.makedirs(dn)
+    os.link("pelicanconf.py", "pelican/pelicanconf.py")
+
+    wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)
+    keep = Filter()
+    first_published = {}
+    db = sqlite3.connect("trac.db")
+    db.row_factory = Row
+    try:
+        # Rows arrive ordered by (name, version), so each later revision overwrites the earlier files.
+        for row in db.execute(wiki_query):
+            if not keep(row.name):
+                continue
+            slug = urllib.parse.quote(row.name, "")
+            with open("wiki/{}.trac".format(slug), "w", encoding="utf-8") as f:
+                f.write(row.text)
+            md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
+            with open("pelican/content/{}.md".format(slug), "w", encoding="utf-8") as f:
+                f.write(md)
+
+        for row in db.execute(attachment_query):
+            src, dst = attachment_link(row)
+            os.makedirs(os.path.dirname(dst), exist_ok=True)
+            os.link(src, dst)
+    finally:
+        db.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/filter.json b/filter.json
new file mode 100644
index 0000000..708428e
--- /dev/null
+++ b/filter.json
@@ -0,0 +1,19 @@
+[
+    ["+", "WikiStart"],
+    ["-", "CamelCase"],
+    ["-", "EDAToolchainSurvey\""],
+    ["-", "GitRepositories*"],
+    ["-", "InterMapTxt"],
+    ["-", "InterTrac"],
+    ["-", "InterWiki"],
+    ["-", "PageTemplates"],
+    ["-", "PhotoFolder"],
+    ["-", "RecentChanges"],
+    ["-", "SandBox"],
+    ["-", "ScratchPage"],
+    ["-", "TicketQuery"],
+    ["-", "TitleIndex"],
+    ["-", "Trac*"],
+    ["-", "Wiki*"],
+    ["+", "*"]
+]
diff --git a/pelicanconf.py b/pelicanconf.py
new file mode 100644
index 0000000..ab8afd2
--- /dev/null
+++ b/pelicanconf.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*- #
+
+AUTHOR = "Cryptech Core Team"
+SITENAME = "Cryptech Project"
+
+PATH = "content"
+TIMEZONE = "UTC"
+DEFAULT_LANG = "en"
+
+# Development settings: relative URLs, and no feeds (feed generation is usually not desired when developing)
+SITEURL = ""
+RELATIVE_URLS = True
+FEED_ALL_ATOM = None
+CATEGORY_FEED_ATOM = None
+TRANSLATION_FEED_ATOM = None
+AUTHOR_FEED_ATOM = None
+AUTHOR_FEED_RSS = None
+
+# Blogroll
+LINKS = (("Pelican", "http://getpelican.com/"),
+         ("Python.org", "http://python.org/"),
+         ("Jinja2", "http://jinja.pocoo.org/"))
+LINKS_WIDGET_NAME = "Links"
+
+# Social widget.  Can't get rid of this with the default theme, only change its name.
+# TODO: fiddle with themes later.
+SOCIAL = ()
+SOCIAL_WIDGET_NAME = "Subscribe"
+
+DEFAULT_PAGINATION = 10
+
+#THEME = "/home/blog/pelican-themes/sundown"
diff --git a/trac2md.py b/trac2md.py
new file mode 100755
index 0000000..5a00754
--- /dev/null
+++ b/trac2md.py
@@ -0,0 +1,193 @@
+# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
+# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
+# Has mutated considerably since then.
+
+import re
+from urllib.parse import quote
+
+class Trac2Markdown:
+
+    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
+    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
+
+    wikiheading_patterns = tuple(
+        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
+        for level in range(1, 7)
+    )
+
+    def convert_headers(self, line):
+        """Rewrite a Trac heading ("== Title ==") as Markdown ("## Title"); return other lines unchanged."""
+        for level_count, header in self.wikiheading_patterns:
+            # Test the match explicitly: a bare "except:" here would also swallow
+            # KeyboardInterrupt and hide genuine programming errors.
+            m = header.search(line)
+            title = m.group(1) if m else None
+            if title:
+                return "{} {}".format('#' * level_count, title.rstrip("= \r\t"))
+        return line
+
+    def convert_to_creole(self, m):
+        """Regex callback: turn Trac's native [target label] link into Creole's [[target|label]].
+
+        Later stages then only deal with the Creole form; brackets that do not look like a link are kept verbatim.
+        """
+        body = m.group(1).strip()
+        target, space, label = body.partition(" ")
+        if space:
+            return "[[{}|{}]]".format(target, label)
+        looks_like_link = ":" in body or self.camelcase_pattern.match(body)
+        return "[[{}]]".format(body) if looks_like_link else m.group(0)
+
+    # Probably most of the non-wiki scheme tests should become a table in an
+    # extended JSON config file which maps
+    #
+    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
+
+    def convert_wikilinks(self, m):
+        """Regex callback: render one Creole [[scheme:target|label]] link as Markdown (uses self.slug/source_url)."""
+        scheme, link, text = (g.strip() if g else g for g in m.groups())
+        text = link if text is None else text
+        # A target or label wrapped in matching single or double quotes loses the quotes.
+        if link[:1] in ('"', "'") and link[-1:] == link[:1]:
+            link = link[1:-1]
+        if text[:1] in ('"', "'") and text[-1:] == text[:1]:
+            text = text[1:-1]
+        if text == link and link.startswith("http") and "://" in link:
+            return "<{}>".format(link)
+        if scheme == "attachment:":
+            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
+        if scheme in ("source:", "browser:"):
+            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
+        if scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
+            return "[{}]({{filename}}{}.md)".format(text, link)
+        return "[{}]({})".format(text, link)
+
+    def convert_image(self, m):
+        """Regex callback for the Image macro: use its first argument as a URL or as an attachment of self.slug."""
+        target = m.group(1).partition(",")[0].strip()
+        if "://" not in target:
+            return "![{}]({{attach}}{}/{})".format(target, self.slug, quote(target, ""))
+        return "<img src=\"{}\">".format(target)
+
+    def __init__(self, source_url):
+        """Store source_url (base for source:/browser: links) and build the ordered (pattern, action) table."""
+        self.source_url = source_url
+        self.pattern_actions = (
+            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
+            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),
+
+            # Convert CamelCase links to explicit links
+            (self.camelcase_pattern,                                                    r"[[\1]]"),
+
+            # Convert !x quoting: drop the "!" escape and keep the word that follows it
+            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),
+
+            # Convert (limited subset of) spans
+            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),
+
+            # Convert images
+            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),
+
+            # Delete Trac macros that have no useful counterpart
+            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),
+
+            # Convert wiki links
+            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),
+
+            # Convert struck-through text
+            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),
+
+            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
+            (re.compile(r"\\\\$"),                                                      r"  "),
+
+            # Convert bold and italic text (do this last; ''' has to be handled before '')
+            (re.compile(r"'''"),                                                        r"**"),
+            (re.compile(r"''"),                                                         r"*"),
+        )
+
+    def __call__(self, content, slug):
+        """Convert the Trac wiki markup in content to Markdown and return it as one string.
+
+        slug is kept in self.slug for the duration of the call so that the link and image callbacks
+        can build {attach} paths.  {{{ and }}} become ``` fences; while a block is open, other conversions are skipped.
+        """
+        self.slug = slug
+        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
+        new_content = []
+
+        code_block = False
+        in_list = False
+        in_table = False
+        nested_level = 0
+        prev_indent = 0
+        text_indent = 0         # set by each list item, read by the end-of-list test below
+
+        # Iterate in place: popping each line off the front of the list made this loop quadratic.
+        for line in old_content:
+            line = line.rstrip()
+            tail = ["\n"]
+            if "{{{" in line:
+                code_block = True
+                line = line.replace("{{{", "```")
+            if "}}}" in line:
+                code_block = False
+                line = line.replace("}}}", "```")
+            if not code_block:
+                # Convert tables.  References:
+                #   https://github.github.com/gfm/#tables-extension-
+                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
+                # Table start: line containing "||"; table end: blank line?
+                #
+                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
+                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
+                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
+                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
+                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
+                #
+                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
+                # structure to something markdown will believe.
+                if line.strip().startswith("||"):
+                    line = line.replace("=|", "|").replace("|=", "|")
+                    line = line.replace("||", "|")
+                    if not in_table:
+                        tail.append("|---" * (line.count("|") - 1) + "|\n")
+                    in_table = True
+                elif in_table and not line.strip().startswith("||"):
+                    new_content.append("\n")
+                    in_table = False
+
+                # Convert bullet lists.  The start and end of a list needs an empty line.
+                nested_line = line.lstrip(' ')
+                if nested_line.startswith('- ') or nested_line.startswith('* '):
+                    if not in_list:
+                        new_content.append("\n")
+                        nested_level = 0
+                        prev_indent = 0
+                        in_list = True
+                    indent = len(line) - len(nested_line)
+                    text_indent = len(line) - len(nested_line[1:].lstrip())
+                    if indent > prev_indent:
+                        nested_level += 1
+                    elif indent < prev_indent:
+                        nested_level -= 1
+                    prev_indent = indent
+                    line = '    ' * nested_level + nested_line
+                elif in_list and len(line) < len(nested_line) + text_indent:
+                    new_content.append("\n")
+                    in_list = False
+                    nested_level = 0
+                    prev_indent = 0
+                    text_indent = 0
+
+                # Convert headers
+                line = self.convert_headers(line)
+
+                # Rest is regexp-driven conversions
+                for pattern, action in self.pattern_actions:
+                    line = pattern.sub(action, line)
+
+            new_content.append(line)
+            new_content.extend(tail)
+
+        del self.slug
+        return "".join(new_content)
-- 
cgit v1.2.3