summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore7
-rw-r--r--GNUmakefile24
-rw-r--r--Makefile6
-rw-r--r--README15
-rwxr-xr-xextract.py128
-rw-r--r--filter.json19
-rw-r--r--pelicanconf.py33
-rwxr-xr-xtrac2md.py193
8 files changed, 425 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3888036
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+TAGS
+__pycache__
+attachments
+pelican/content/*/*
+pelican/pelicanconf.py
+trac.db
+pelican/website
diff --git a/GNUmakefile b/GNUmakefile
new file mode 100644
index 0000000..a6e7257
--- /dev/null
+++ b/GNUmakefile
@@ -0,0 +1,24 @@
# Sample settings, tweak for particular jobs on make command line or
# via environment variables.
#
# These use ?= rather than := so that a value already present in the
# environment is honoured; a plain assignment in the makefile would
# silently override it (command-line settings win either way).

SOURCE_URL ?= https://git.cryptech.is/
SOURCE_TRAC_DB ?= bikeshed.cryptech.is:/home/trac/db/trac.db
SOURCE_TRAC_ATTACHMENTS ?= bikeshed.cryptech.is:/home/trac/files/attachments

# Convert the wiki, then run the result through Pelican.  Use && so that
# pelican is never run from the wrong directory if the cd fails.
all:
	./extract.py --source-url ${SOURCE_URL}
	cd pelican && pelican --output website --settings pelicanconf.py --fatal errors content

# Pull a copy of the Trac database and wiki attachments from the server.
fetch:
	rsync -aP --delete ${SOURCE_TRAC_DB} ${SOURCE_TRAC_ATTACHMENTS} .

# Remove everything extract.py and pelican generate.
clean:
	rm -rf wiki pelican

# Additionally remove the data downloaded by "fetch".
distclean: clean
	rm -rf trac.db attachments

# Serve the generated site locally for inspection.
webfsd:
	webfsd -r pelican/website -4 -L - -F -f index.html

.PHONY: all clean fetch distclean webfsd
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..45988f6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+# BSD-style Makefile so that `make` works on both FreeBSD and Linux.
+# GNU make will just ignore this because of the GNUmakefile.
+
+.MAIN: all
+
+${.TARGETS}: ; @gmake $@
diff --git a/README b/README
new file mode 100644
index 0000000..ecf209a
--- /dev/null
+++ b/README
@@ -0,0 +1,15 @@
+Tools to convert Permatrac Wiki to Markdown format.
+
+This is just the tool set for the Wiki, there's another set of scripts
+for converting Trac tickets but I haven't yet converted them from the
+task-specific form they're in at the moment (from an old Trac site
+whose tickets I had to migrate to GitHub issues about five years
+ago...).
+
+As a test, we run the Markdown from the conversion process through
+Pelican to generate a static site. This may or may not be useful in
+its own right, but for our purposes simply running the Pelican site
+generator is helpful, since it spots a lot of dumb Markdown errors for
+us (missing links, horribly borked Markdown syntax, etc).
+
+Pelican content format: https://docs.getpelican.com/en/latest/content.html
diff --git a/extract.py b/extract.py
new file mode 100755
index 0000000..368211c
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import fnmatch
+import hashlib
+import json
+import os
+import shutil
+import sqlite3
+import sys
+import time
+import urllib.parse
+import argparse
+
+import trac2md
+
# Every revision of every wiki page, grouped by page name with revisions in
# ascending order.  The stored time is divided by 1000000 (presumably
# microseconds -> seconds; the result is later fed to time.gmtime()).
wiki_query = '''
    SELECT
        name,
        author,
        version,
        time / 1000000 AS time,
        text
    FROM wiki
    ORDER BY
        name, version
'''

# Metadata for every attachment that belongs to a wiki page (as opposed to
# any other attachment type), ordered by file name and then by time.
attachment_query = '''
    SELECT
        id,
        filename,
        size,
        author,
        description,
        ipnr,
        time / 1000000 AS createdtime
    FROM
        attachment
    WHERE
        type = 'wiki'
    ORDER BY
        filename, time
'''
+
def attachment_link(row):
    """Return the ``(source, destination)`` paths for one attachment row.

    ``row`` must expose ``id`` (the owning wiki page name) and ``filename``
    as attributes.

    The source path is
    ``attachments/wiki/<sha1(id)[:3]>/<sha1(id)>/<sha1(filename)><ext>``
    (presumably mirroring how Trac stores attachments on disk -- confirm
    against the fetched ``attachments`` tree).  The destination is
    ``pelican/content/<url-quoted id>/<filename>``.
    """

    def sha1_hex(text):
        # Hex SHA-1 of the text's default (UTF-8) byte encoding.
        return hashlib.sha1(text.encode()).hexdigest()

    id_hash = sha1_hex(row.id)
    name_hash = sha1_hex(row.filename)
    # Only the extension of the original file name survives in the source path.
    extension = os.path.splitext(row.filename)[1]

    src = os.path.join("attachments", "wiki", id_hash[:3], id_hash, name_hash + extension)
    # Quote with no "safe" characters so that "/" in a page name cannot
    # introduce extra directory levels.
    dst = os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
    return src, dst
+
class Filter:
    """Decide which wiki pages to keep, based on an ordered rule list.

    The rule file is a JSON array of ``[action, pattern]`` pairs, where
    ``action`` is ``"+"`` (keep) or ``"-"`` (drop) and ``pattern`` is an
    ``fnmatch``-style glob matched against the page name.  The first
    matching rule wins; a name that matches no rule is kept.
    """

    def __init__(self, filename = "filter.json"):
        """Load and validate the rules; exit the program on a bad action."""
        with open(filename) as f:
            rules = json.load(f)
        for action, pattern in rules:
            # Compare against the two legal values explicitly: a substring
            # test against "-+" would also accept "" and "-+".
            if action not in ("-", "+"):
                sys.exit("Bad action \"{}\" in filter".format(action))
        # Store each rule as (keep?, pattern) so matching is a plain lookup.
        self.filter = tuple((action == "+", pattern) for action, pattern in rules)

    def __call__(self, name):
        """Return True if the page called ``name`` should be kept."""
        for keep, pattern in self.filter:
            if fnmatch.fnmatch(name, pattern):
                return keep
        return True
+
class Row(sqlite3.Row):
    """``sqlite3.Row`` whose columns can also be read as attributes."""

    def __getattr__(self, name):
        """Return the column called ``name``.

        Only invoked when normal attribute lookup fails.  A missing column
        is reported as AttributeError (not the IndexError that row indexing
        raises) so that ``hasattr()`` and ``getattr(row, name, default)``
        behave as expected.
        """
        try:
            return self[name]
        except IndexError:
            raise AttributeError(name) from None

    @property
    def isotime(self):
        """The row's ``time`` column (seconds since the epoch) as
        ``YYYY-MM-DD HH:MM`` in UTC."""
        return time.strftime("%Y-%m-%d %H:%M", time.gmtime(self.time))
+
def markdown_header(row, first_published):
    """Build the metadata header that precedes a page's Markdown body.

    ``first_published`` maps page name -> timestamp of the first revision
    seen so far and is updated in place.  The first time a page name is
    seen its timestamp becomes the ``Date``; for any later revision the
    ``Date`` stays unchanged and a ``Modified`` line carrying that
    revision's timestamp is added.  The header ends with a blank line.
    """
    title = row.name
    modified_line = ""
    if title in first_published:
        modified_line = "Modified: {}\n".format(row.isotime)
    else:
        first_published[title] = row.isotime
    fields = (title, row.author, first_published[title], modified_line)
    return "Title: {}\nAuthor: {}\nDate: {}\n{}\n".format(*fields)
+
def main():
    """Export the Trac wiki in ``trac.db`` as Markdown for Pelican.

    Recreates the ``wiki`` and ``pelican`` output trees in the current
    directory, writes every kept page both as raw Trac markup
    (``wiki/<slug>.trac``) and as converted Markdown
    (``pelican/content/<slug>.md``), and hard-links each wiki attachment
    into ``pelican/content/<slug>/``.

    Pages are processed revision by revision in ascending order and each
    revision overwrites the previous output, so the files end up holding
    the latest revision while the header keeps the first revision's date.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--source-url")
    args = ap.parse_args()

    # Start from a clean slate.  The trees do not exist on a first run or
    # after "make clean", so only remove what is actually there.
    for dn in ("wiki", "pelican"):
        if os.path.isdir(dn):
            shutil.rmtree(dn)

    for dn in ("wiki", "pelican/content"):
        os.makedirs(dn)

    os.link("pelicanconf.py", "pelican/pelicanconf.py")

    wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)

    keep = Filter()

    first_published = {}

    db = sqlite3.connect("trac.db")
    try:
        db.row_factory = Row

        for row in db.execute(wiki_query):
            if not keep(row.name):
                continue
            # Quote with no "safe" characters so "/" in a page name cannot
            # escape the output directory.
            slug = urllib.parse.quote(row.name, "")
            # Write UTF-8 explicitly: wiki text may contain non-ASCII
            # characters and the locale's default encoding may not cope.
            with open("wiki/{}.trac".format(slug), "w", encoding="utf-8") as f:
                f.write(row.text)
            md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
            with open("pelican/content/{}.md".format(slug), "w", encoding="utf-8") as f:
                f.write(md)

        for row in db.execute(attachment_query):
            src, dst = attachment_link(row)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            os.link(src, dst)
    finally:
        # Release the database even if a conversion step fails.
        db.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/filter.json b/filter.json
new file mode 100644
index 0000000..708428e
--- /dev/null
+++ b/filter.json
@@ -0,0 +1,19 @@
+[
+ ["+", "WikiStart"],
+ ["-", "CamelCase"],
+ ["-", "EDAToolchainSurvey\""],
+ ["-", "GitRepositories*"],
+ ["-", "InterMapTxt"],
+ ["-", "InterTrac"],
+ ["-", "InterWiki"],
+ ["-", "PageTemplates"],
+ ["-", "PhotoFolder"],
+ ["-", "RecentChanges"],
+ ["-", "SandBox"],
+ ["-", "ScratchPage"],
+ ["-", "TicketQuery"],
+ ["-", "TitleIndex"],
+ ["-", "Trac*"],
+ ["-", "Wiki*"],
+ ["+", "*"]
+]
diff --git a/pelicanconf.py b/pelicanconf.py
new file mode 100644
index 0000000..ab8afd2
--- /dev/null
+++ b/pelicanconf.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*- #

# Pelican settings used to render the converted wiki as a static site.

# Site identity
AUTHOR = "Cryptech Core Team"
SITENAME = "Cryptech Project"

# Content directory, time zone and default language
PATH = "content"
TIMEZONE = "UTC"
DEFAULT_LANG = "en"

# Relative URLs with an empty site URL
SITEURL = ""
RELATIVE_URLS = True

# Feed generation is usually not desired when developing
FEED_ALL_ATOM = CATEGORY_FEED_ATOM = TRANSLATION_FEED_ATOM = None
AUTHOR_FEED_ATOM = AUTHOR_FEED_RSS = None

# Blogroll
LINKS_WIDGET_NAME = "Links"
LINKS = (
    ("Pelican", "http://getpelican.com/"),
    ("Python.org", "http://python.org/"),
    ("Jinja2", "http://jinja.pocoo.org/"),
)

# Social widget.  Can't get rid of this with default theme, only change its name.
# Fiddle with themes later
SOCIAL_WIDGET_NAME = "Subscribe"
SOCIAL = ()

DEFAULT_PAGINATION = 10

#THEME = "/home/blog/pelican-themes/sundown"
diff --git a/trac2md.py b/trac2md.py
new file mode 100755
index 0000000..5a00754
--- /dev/null
+++ b/trac2md.py
@@ -0,0 +1,193 @@
+# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
+# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
+# Has mutated considerably since then.
+
+import re
+from urllib.parse import quote
+
class Trac2Markdown:
    """Line-oriented converter from Trac wiki markup to Markdown.

    Construct with the base URL that ``source:`` / ``browser:`` links should
    point at, then call the instance with a page's text and its slug to get
    the converted text.  Attachments and intra-wiki links are emitted with
    ``{attach}`` and ``{filename}`` placeholders.
    """

    # "[[br]]" (any case) or a double backslash: Trac's explicit line breaks.
    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)

    # A CamelCase word standing alone between whitespace / line boundaries.
    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

    # (level, pattern) for "= h1 =" through "====== h6 ======".
    wikiheading_patterns = tuple(
        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
        for level in range(1, 7)
    )

    def convert_headers(self, line):
        """Return ``line`` with a Trac heading rewritten as a ``#`` heading.

        Lines that are not headings are returned unchanged.
        """
        for level_count, header in self.wikiheading_patterns:
            m = header.search(line)
            if m is None:
                continue  # Try the next heading level
            title = m.group(1)
            if title:
                # The greedy group also swallows the closing "=" run.
                line = "{} {}".format("#" * level_count, title.rstrip("= \r\t"))
                break  # No need to check other heading levels
        return line

    def convert_to_creole(self, m):
        """Regex callback: rewrite ``[target label]`` as ``[[target|label]]``."""
        # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
        # Creole's is easier to parse and harder to confuse with partially converted Markdown.

        text = m.group(1).strip()
        if " " in text:
            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
        elif ":" in text or self.camelcase_pattern.match(text):
            return "[[{}]]".format(text)
        else:
            # Not recognisably a link; leave the bracketed text alone.
            return m.group(0)

    # Probably most of the non-wiki scheme tests should become a table in an
    # extended JSON config file which maps
    #
    # { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

    def convert_wikilinks(self, m):
        """Regex callback: turn a Creole ``[[scheme:link|text]]`` into Markdown.

        Reads ``self.slug`` (for attachments) and ``self.source_url`` (for
        ``source:`` / ``browser:`` links; it must be a string for those).
        """
        scheme, link, text = [p.strip() if p else p for p in m.groups()]
        if text is None:
            text = link
        # Strip one pair of matching quotes around the target and the label.
        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
            link = link[1:-1]
        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
            text = text[1:-1]
        if text == link and link.startswith("http") and "://" in link:
            # Bare URL: use Markdown's autolink form.
            return "<{}>".format(link)
        elif scheme == "attachment:":
            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
        elif scheme in ("source:", "browser:"):
            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
            return "[{}]({{filename}}{}.md)".format(text, link)
        else:
            return "[{}]({})".format(text, link)

    def convert_image(self, m):
        """Regex callback: convert an ``[[Image(...)]]`` macro.

        Only the first comma-separated argument is used.  A URL becomes an
        ``<img>`` tag; anything else is treated as an attachment of the
        current page.
        """
        text = m.group(1).split(",")[0].strip()
        if "://" in text:
            return "<img src=\"{}\">".format(text)
        else:
            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))

    def __init__(self, source_url):
        """Remember ``source_url`` and build the ordered substitution table."""
        self.source_url = source_url
        # Applied in order to every line outside a code block; later entries
        # rely on the normalisation done by earlier ones.
        self.pattern_actions = (

            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
            (re.compile(r"(?<!\[)\[([^][]+)\]"), self.convert_to_creole),

            # Convert CamelCase links to explicit links
            (self.camelcase_pattern, r"[[\1]]"),

            # Convert !x quoting
            (re.compile(r"!((?:\w|[#])+)"), r"\1"),

            # Convert (limited subset of) spans
            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"), r"\1"),

            # Convert images
            (re.compile(r"\[\[Image\((.*)\)\]\]"), self.convert_image),

            # Delete Trac macros that have no useful counterpart
            (re.compile(r"\[\[PageOutline\]\]", re.I), r""),

            # Convert wiki links
            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"), self.convert_wikilinks),

            # Convert striked through text
            (re.compile(r"~~([^~]+)~~"), r"<s>\1</s>"),

            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            # The replacement must be exactly two spaces; the newline is added when the line is emitted.
            (re.compile(r"\\\\$"), r"  "),

            # Convert bold and italic text (do this last)
            (re.compile(r"'''"), r"**"),
            (re.compile(r"''"), r"*"),
        )

    def __call__(self, content, slug):
        """Convert the Trac markup in ``content`` and return Markdown text.

        ``slug`` identifies the page being converted and is used when
        building attachment links.  It is only available as ``self.slug``
        for the duration of the call.
        """
        self.slug = slug
        try:
            return self._convert_content(content)
        finally:
            # Never leave a stale slug behind, even if conversion fails.
            del self.slug

    def _convert_content(self, content):
        """Do the line-by-line conversion for ``__call__``."""
        # Normalise every explicit line break to "\\" + newline so that it
        # ends a line and the end-of-line pattern above can find it.
        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
        new_content = []

        code_block = False
        in_list = False
        in_table = False
        nested_level = 0
        prev_indent = 0
        text_indent = 0

        for raw_line in old_content:
            line = raw_line.rstrip()
            tail = ["\n"]
            # Each pass replaces every occurrence of a marker, so this loop
            # body runs at most once per line.  A line containing both
            # markers ends up outside a code block.
            while "{{{" in line or "}}}" in line:
                if "{{{" in line:
                    code_block = True
                    line = line.replace("{{{", "```")
                if "}}}" in line:
                    code_block = False
                    line = line.replace("}}}", "```")
            if not code_block:

                # Convert tables.  References:
                # https://github.github.com/gfm/#tables-extension-
                # https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
                # Table start: line containing "||"; table end: blank line?
                #
                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
                #
                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
                # structure to something markdown will believe.

                if line.strip().startswith("||"):
                    line = line.replace("=|", "|").replace("|=", "|")
                    line = line.replace("||", "|")
                    if not in_table:
                        # First row doubles as the header; follow it with
                        # one delimiter cell per column.
                        tail.append("|---" * (line.count("|") - 1) + "|\n")
                        in_table = True
                elif in_table and not line.strip().startswith("||"):
                    new_content.append("\n")
                    in_table = False

                #
                # Convert bullet lists.  The start and end of a list needs an empty line.
                #
                nested_line = line.lstrip(' ')
                if nested_line.startswith('- ') or nested_line.startswith('* '):
                    if not in_list:
                        new_content.append("\n")
                        nested_level = 0
                        prev_indent = 0
                        in_list = True
                    indent = len(line) - len(nested_line)
                    # Column at which the item's text starts; less-indented
                    # follow-on lines end the list.
                    text_indent = len(line) - len(nested_line[1:].lstrip())
                    if indent > prev_indent:
                        nested_level += 1
                    elif indent < prev_indent:
                        nested_level -= 1
                    prev_indent = indent
                    line = ' ' * nested_level + nested_line
                elif in_list and len(line) < len(nested_line) + text_indent:
                    new_content.append("\n")
                    in_list = False
                    nested_level = 0
                    prev_indent = 0
                    text_indent = 0

                # Convert headers
                line = self.convert_headers(line)

                # Rest is regexp-driven conversions
                for pattern, action in self.pattern_actions:
                    line = pattern.sub(action, line)

            new_content.append(line)
            new_content.extend(tail)

        return "".join(new_content)