Browse Source

First cut at project-independent version of these scripts

Rob Austein 2 years ago
commit
be858bc28a
8 changed files with 425 additions and 0 deletions
  1. 7 0
      .gitignore
  2. 24 0
      GNUmakefile
  3. 6 0
      Makefile
  4. 15 0
      README
  5. 128 0
      extract.py
  6. 19 0
      filter.json
  7. 33 0
      pelicanconf.py
  8. 193 0
      trac2md.py

+ 7 - 0
.gitignore

@@ -0,0 +1,7 @@
+TAGS
+__pycache__
+attachments
+pelican/content/*/*
+pelican/pelicanconf.py
+trac.db
+pelican/website

+ 24 - 0
GNUmakefile

@@ -0,0 +1,24 @@
+# Sample settings, tweak for particular jobs on make command line or
+# via environment variables.
+#
+# These use ?= rather than := so that a value inherited from the
+# environment is honored; a plain assignment in the makefile would
+# silently override the environment.
+
+SOURCE_URL		?= https://git.cryptech.is/
+SOURCE_TRAC_DB		?= bikeshed.cryptech.is:/home/trac/db/trac.db
+SOURCE_TRAC_ATTACHMENTS ?= bikeshed.cryptech.is:/home/trac/files/attachments
+
+# Convert the wiki to Markdown, then run Pelican over the result as a
+# sanity check.  Use && so Pelican never runs from the wrong directory
+# if the cd fails.
+all:
+	./extract.py --source-url ${SOURCE_URL}
+	cd pelican && pelican --output website --settings pelicanconf.py --fatal errors content
+
+# Pull a copy of the Trac database and wiki attachments from the server.
+fetch:
+	rsync -aP --delete ${SOURCE_TRAC_DB} ${SOURCE_TRAC_ATTACHMENTS} .
+
+# Remove everything extract.py and Pelican generate.
+clean:
+	rm -rf wiki pelican
+
+# Also remove the data retrieved by the fetch target.
+distclean: clean
+	rm -rf trac.db attachments
+
+# Serve the generated site locally for inspection.
+webfsd:
+	webfsd -r pelican/website -4 -L - -F -f index.html
+
+.PHONY: all clean fetch distclean webfsd

+ 6 - 0
Makefile

@@ -0,0 +1,6 @@
+# BSD-style Makefile so that `make` works on both FreeBSD and Linux.
+# GNU make will just ignore this because of the GNUmakefile.
+
+# With no target named on the command line, build `all`.
+.MAIN: all
+
+# Hand every requested target over to GNU make (gmake), which does the
+# real work using the rules in GNUmakefile.
+${.TARGETS}: ; @gmake $@

+ 15 - 0
README

@@ -0,0 +1,15 @@
+Tools to convert Permatrac Wiki to Markdown format.
+
+This is just the tool set for the Wiki, there's another set of scripts
+for converting Trac tickets but I haven't yet converted them from the
+task-specific form they're in at the moment (from an old Trac site
+whose tickets I had to migrate to GitHub issues about five years
+ago...).
+
+As a test, we run the Markdown from the conversion process through
+Pelican to generate a static site.  This may or may not be useful in
+its own right, but for our purposes simply running the Pelican site
+generator is helpful, since it spots a lot of dumb Markdown errors for
+us (missing links, horribly borked Markdown syntax, etc).
+
+Pelican content format: https://docs.getpelican.com/en/latest/content.html

+ 128 - 0
extract.py

@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import fnmatch
+import hashlib
+import json
+import os
+import shutil
+import sqlite3
+import sys
+import time
+import urllib.parse
+import argparse
+
+import trac2md
+
+# Every stored version of every wiki page.  Rows are ordered by name
+# and then version, so for any given page main() sees the oldest
+# version first (markdown_header() relies on this to pick the original
+# publication date) and the newest version last (so its output is what
+# remains on disk).  The time column is divided by 1000000 so that
+# Row.isotime can pass it straight to time.gmtime(); presumably the
+# database stores microseconds -- confirm against the Trac schema.
+wiki_query = '''
+  SELECT
+    name, 
+    author,
+    version, 
+    time / 1000000 AS time, 
+    text 
+  FROM wiki
+  ORDER BY
+    name, version
+'''
+
+# Every attachment whose type is 'wiki', ordered by filename and then
+# time.  attachment_link() only uses the id and filename columns; the
+# rest are selected but not used by the code in this file.
+attachment_query = '''
+  SELECT
+    id,
+    filename,
+    size,
+    author,
+    description,
+    ipnr,
+    time / 1000000 AS createdtime
+  FROM
+    attachment
+  WHERE
+    type = 'wiki'
+  ORDER BY
+    filename, time
+'''
+
+def attachment_link(row):
+    h   = lambda whatever: hashlib.sha1(whatever.encode()).hexdigest()
+    h1  = h(row.id)
+    h2  = h(row.filename)
+    fn2 = os.path.splitext(row["filename"])[1]
+    return \
+        os.path.join("attachments", "wiki", h1[:3], h1, h2 + fn2), \
+        os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
+
+class Filter:
+
+    def __init__(self, filename = "filter.json"):
+        with open(filename) as f:
+            filter = json.load(f)
+        if not all(action in "-+" for action, pattern in filter):
+            sys.exit("Bad action \"{}\" in filter".format(action))
+        self.filter = tuple((action == "+", pattern) for action, pattern in filter)
+
+    def __call__(self, name):
+        for action, pattern in self.filter:
+            if fnmatch.fnmatch(name, pattern):
+                return action
+        return True
+
+class Row(sqlite3.Row):
+
+    def __getattr__(self, name):
+        return self[name]
+
+    @property
+    def isotime(self):
+        return time.strftime("%Y-%m-%d %H:%M", time.gmtime(self.time))
+
+def markdown_header(row, first_published):
+    if row.name in first_published:
+        modtime = "Modified: {}\n".format(row.isotime)
+    else:
+        modtime = ""
+        first_published[row.name] = row.isotime
+    return "Title: {}\nAuthor: {}\nDate: {}\n{}\n".format(row.name, row.author, first_published[row.name], modtime)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--source-url")
+    args = ap.parse_args()
+
+    for dn in ("wiki", "pelican"):
+        shutil.rmtree(dn)
+
+    for dn in ("wiki", "pelican/content"):
+        os.makedirs(dn)
+
+    os.link("pelicanconf.py", "pelican/pelicanconf.py")
+
+    wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)
+
+    keep = Filter()
+
+    first_published = {}
+
+    db = sqlite3.connect("trac.db")
+    db.row_factory = Row
+
+    for row in db.execute(wiki_query):
+        if keep(row.name):
+            slug = urllib.parse.quote(row.name, "")
+            #print(slug, row.version)
+            with open("wiki/{}.trac".format(slug), "w") as f:
+                f.write(row.text)
+            md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
+            with open("pelican/content/{}.md".format(slug), "w") as f:
+                f.write(md)
+
+    for row in db.execute(attachment_query):
+        src, dst = attachment_link(row)
+        #print("{} => {}".format(dst, src))
+        if not os.path.isdir(os.path.dirname(dst)):
+            os.makedirs(os.path.dirname(dst))
+        os.link(src, dst)
+
+    db.close()
+
+if __name__ == "__main__":
+    main()

+ 19 - 0
filter.json

@@ -0,0 +1,19 @@
+[
+    ["+", "WikiStart"],
+    ["-", "CamelCase"],
+    ["-", "EDAToolchainSurvey"],
+    ["-", "GitRepositories*"],
+    ["-", "InterMapTxt"],
+    ["-", "InterTrac"],
+    ["-", "InterWiki"],
+    ["-", "PageTemplates"],
+    ["-", "PhotoFolder"],
+    ["-", "RecentChanges"],
+    ["-", "SandBox"],
+    ["-", "ScratchPage"],
+    ["-", "TicketQuery"],
+    ["-", "TitleIndex"],
+    ["-", "Trac*"],
+    ["-", "Wiki*"],
+    ["+", "*"]
+]

+ 33 - 0
pelicanconf.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*- #
+
+# Pelican settings for the test site built from the converted wiki.
+# extract.py links this file into pelican/, and the GNUmakefile passes
+# it to Pelican via --settings.
+
+# Site identity
+AUTHOR = "Cryptech Core Team"
+SITENAME = "Cryptech Project"
+
+# Content location (relative to the directory Pelican is run from),
+# timezone and default language
+PATH = "content"
+TIMEZONE = "UTC"
+DEFAULT_LANG = "en"
+
+# Feed generation is usually not desired when developing
+SITEURL = ""
+RELATIVE_URLS = True
+FEED_ALL_ATOM = None
+CATEGORY_FEED_ATOM = None
+TRANSLATION_FEED_ATOM = None
+AUTHOR_FEED_ATOM = None
+AUTHOR_FEED_RSS = None
+
+# Blogroll
+LINKS = (("Pelican", "http://getpelican.com/"),
+         ("Python.org", "http://python.org/"),
+         ("Jinja2", "http://jinja.pocoo.org/"))
+LINKS_WIDGET_NAME = "Links"
+
+# Social widget.  Can't get rid of this with default theme, only change its name.
+# Fiddle with themes later
+SOCIAL = ()
+SOCIAL_WIDGET_NAME = "Subscribe"
+
+# Number of items per index page
+DEFAULT_PAGINATION = 10
+
+# Left disabled, so Pelican's default theme is used
+#THEME = "/home/blog/pelican-themes/sundown"

+ 193 - 0
trac2md.py

@@ -0,0 +1,193 @@
+# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
+# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
+# Has mutated considerably since then.
+
+import re
+from urllib.parse import quote
+
+class Trac2Markdown:
+
+    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
+    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
+
+    wikiheading_patterns = tuple(
+        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
+        for level in range(1, 7)
+    )
+
+    def convert_headers(self, line):
+        for level_count, header in self.wikiheading_patterns:
+            try:
+                level = header.search(line).group(1)
+                if level:
+                    line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
+                    break          # No need to check other heading levels
+            except:
+                pass                # Try the next heading level
+        return line
+
+    def convert_to_creole(self, m):
+        # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
+        # Creole's is easier to parse and harder to confuse with partially converted Markdown.
+
+        text = m.group(1).strip()
+        if " " in text:
+            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
+        elif ":" in text or self.camelcase_pattern.match(text):
+            return "[[{}]]".format(text)
+        else:
+            return m.group(0)
+
+    # Probably most of the non-wiki scheme tests should become a table in an
+    # extended JSON config file which maps
+    #
+    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
+
+    def convert_wikilinks(self, m):
+        scheme, link, text = [p.strip() if p else p for p in  m.groups()]
+        if text is None:
+            text = link
+        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
+            link = link[1:-1]
+        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
+            text = text[1:-1]
+        if text == link and link.startswith("http") and "://" in link:
+            return "<{}>".format(link)
+        elif scheme == "attachment:":
+            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
+        elif scheme in ("source:", "browser:"):
+            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
+        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
+            return "[{}]({{filename}}{}.md)".format(text, link)
+        else:
+            return "[{}]({})".format(text, link)
+
+    def convert_image(self, m):
+        text = m.group(1).split(",")[0].strip()
+        if "://" in text:
+            return "<img src=\"{}\">".format(text)
+        else:
+            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))
+
+    def __init__(self, source_url):
+        self.source_url = source_url
+        self.pattern_actions = (
+
+            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
+            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),
+
+            # Convert CamelCase links to explicit links
+            (self.camelcase_pattern,                                                    r"[[\1]]"),
+
+            # Convert !x quoting
+            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),
+
+            # Convert (limited subset of) spans
+            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),
+
+            # Convert images
+            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),
+
+            # Delete Trac macros that have no useful counterpart
+            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),
+
+            # Convert wiki links
+            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),
+
+            # Convert striked through text
+            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),
+
+            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
+            (re.compile(r"\\\\$"),                                                      r"  "),
+
+            # Convert bold and italic text (do this last)
+            (re.compile(r"'''"),                                                        r"**"),
+            (re.compile(r"''"),                                                         r"*"),
+        )
+
+    def __call__(self, content, slug):
+        self.slug = slug
+
+        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
+        new_content = []
+
+        code_block = False
+        in_list = False
+        in_table = False
+        nested_level = 0
+        prev_indent = 0
+
+        while old_content:
+            line = old_content.pop(0).rstrip()
+            tail = ["\n"]
+            while "{{{" in line or "}}}" in line:
+                if "{{{" in line:
+                    code_block = True
+                    line = line.replace("{{{", "```")
+                if "}}}" in line:
+                    code_block = False
+                    line = line.replace("}}}", "```")
+            if not code_block:
+
+                # Convert tables.  References:
+                #   https://github.github.com/gfm/#tables-extension-
+                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
+                # Table start: line containing "||"; table end: blank line?
+                #
+                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
+                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
+                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
+                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
+                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
+                #
+                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
+                # structure to something markdown will believe.
+
+                if line.strip().startswith("||"):
+                    line = line.replace("=|", "|").replace("|=", "|")
+                    line = line.replace("||", "|")
+                    if not in_table:
+                        tail.append("|---" * (line.count("|") - 1) + "|\n")
+                    in_table = True
+                elif in_table and not line.strip().startswith("||"):
+                    new_content.append("\n")
+                    in_table = False
+
+                #
+                # Convert bullet lists.  The start and end of a list needs an empty line.
+                #
+                nested_line = line.lstrip(' ')
+                if nested_line.startswith('- ') or nested_line.startswith('* '):
+                    if not in_list:
+                        new_content.append("\n")
+                        nested_level = 0
+                        prev_indent = 0
+                        in_list = True
+                    indent = len(line) - len(nested_line)
+                    text_indent = len(line) - len(nested_line[1:].lstrip())
+                    if indent > prev_indent:
+                        nested_level += 1
+                    elif indent < prev_indent:
+                        nested_level -= 1
+                    prev_indent = indent
+                    line = '    ' * nested_level + nested_line
+                elif in_list and len(line) < len(nested_line) + text_indent:
+                    new_content.append("\n")
+                    in_list = False
+                    nested_level = 0
+                    prev_indent = 0
+                    text_indent = 0
+
+                # Convert headers
+                line = self.convert_headers(line)
+
+                # Rest is regexp-driven conversions
+                for pattern, action in self.pattern_actions:
+                    line = pattern.sub(action, line)
+
+            new_content.append(line)
+            new_content.extend(tail)
+
+        del self.slug
+
+        return "".join(new_content)