diff options
-rw-r--r-- | .gitignore | 7 | ||||
-rw-r--r-- | GNUmakefile | 24 | ||||
-rw-r--r-- | Makefile | 6 | ||||
-rw-r--r-- | README | 15 | ||||
-rwxr-xr-x | extract.py | 128 | ||||
-rw-r--r-- | filter.json | 19 | ||||
-rw-r--r-- | pelicanconf.py | 33 | ||||
-rwxr-xr-x | trac2md.py | 193 |
8 files changed, 425 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3888036 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +TAGS +__pycache__ +attachments +pelican/content/*/* +pelican/pelicanconf.py +trac.db +pelican/website diff --git a/GNUmakefile b/GNUmakefile new file mode 100644 index 0000000..a6e7257 --- /dev/null +++ b/GNUmakefile @@ -0,0 +1,24 @@ +# Sample settings, tweak for particular jobs on make command line or +# via environment variables. + +SOURCE_URL := https://git.cryptech.is/ +SOURCE_TRAC_DB := bikeshed.cryptech.is:/home/trac/db/trac.db +SOURCE_TRAC_ATTACHMENTS := bikeshed.cryptech.is:/home/trac/files/attachments + +all: + ./extract.py --source-url ${SOURCE_URL} + cd pelican; pelican --output website --settings pelicanconf.py --fatal errors content + +fetch: + rsync -aP --delete ${SOURCE_TRAC_DB} ${SOURCE_TRAC_ATTACHMENTS} . + +clean: + rm -rf wiki pelican + +distclean: clean + rm -rf trac.db attachments + +webfsd: + webfsd -r pelican/website -4 -L - -F -f index.html + +.PHONY: all clean fetch distclean webfsd diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..45988f6 --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +# BSD-style Makefile so that `make` works on both FreeBSD and Linux. +# GNU make will just ignore this because of the GNUmakefile. + +.MAIN: all + +${.TARGETS}: ; @gmake $@ @@ -0,0 +1,15 @@ +Tools to convert Permatrac Wiki to Markdown format. + +This is just the tool set for the Wiki, there's another set of scripts +for converting Trac tickets but I haven't yet converted them from the +task-specific form they're in at the moment (from an old Trac site +whose tickets I had to migrate to GitHub issues about five years +ago...). + +As a test, we run the Markdown from the conversion process through +Pelican to generate a static site. 
# ---------------------------------------------------------------------------
# README (continued; the opening of the README precedes this span)
# ---------------------------------------------------------------------------
#
# This may or may not be useful in
# its own right, but for our purposes simply running the Pelican site
# generator is helpful, since it spots a lot of dumb Markdown errors for
# us (missing links, horribly borked Markdown syntax, etc).
#
# Pelican content format: https://docs.getpelican.com/en/latest/content.html

# ---------------------------------------------------------------------------
# diff --git a/extract.py b/extract.py
# new file mode 100755 index 0000000..368211c
# ---------------------------------------------------------------------------

#!/usr/bin/env python3
"""Extract wiki pages and attachments from a Trac SQLite database.

Every version of every wiki page that passes the filter is written twice:
as raw Trac markup under ``wiki/`` and as Pelican-flavoured Markdown under
``pelican/content/``.  Wiki attachments are hard-linked into the Pelican
content tree next to the page they belong to.
"""

import fnmatch
import hashlib
import json
import os
import shutil
import sqlite3
import sys
import time
import urllib.parse
import argparse

# All versions of all wiki pages, oldest version of each page first, so that
# later versions overwrite earlier ones on disk.  Trac stores timestamps in
# microseconds; the query converts them to seconds.
wiki_query = '''
  SELECT
    name,
    author,
    version,
    time / 1000000 AS time,
    text
  FROM wiki
  ORDER BY
    name, version
'''

# All attachments that belong to wiki pages (as opposed to tickets).
attachment_query = '''
  SELECT
    id,
    filename,
    size,
    author,
    description,
    ipnr,
    time / 1000000 AS createdtime
  FROM
    attachment
  WHERE
    type = 'wiki'
  ORDER BY
    filename, time
'''


def _sha1_hex(text):
    """Return the hexadecimal SHA-1 digest of *text* (encoded as UTF-8)."""
    return hashlib.sha1(text.encode()).hexdigest()


def attachment_link(row):
    """Map an attachment row to its (source, destination) file names.

    The source is the hashed path built from the row:
    ``attachments/wiki/<sha1(id)[:3]>/<sha1(id)>/<sha1(filename)><ext>``.
    The destination is ``pelican/content/<quoted id>/<filename>``.

    Args:
        row: a row from ``attachment_query`` exposing ``id`` and ``filename``.

    Returns:
        A ``(src, dst)`` tuple of relative paths.
    """
    id_hash = _sha1_hex(row.id)
    name_hash = _sha1_hex(row.filename)
    extension = os.path.splitext(row.filename)[1]
    src = os.path.join("attachments", "wiki", id_hash[:3], id_hash, name_hash + extension)
    dst = os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
    return src, dst


class Filter:
    """Ordered include/exclude list of ``fnmatch`` patterns.

    The JSON file holds a list of ``[action, pattern]`` pairs where action is
    ``"+"`` (keep) or ``"-"`` (drop).  The first pattern that matches a page
    name decides; names that match nothing are kept.
    """

    def __init__(self, filename = "filter.json"):
        """Load the rules from *filename*; exit the program on a bad action."""
        with open(filename, encoding = "utf-8") as f:
            rules = json.load(f)
        for action, pattern in rules:
            # Compare against the two legal values rather than using a
            # substring test, which would also accept "" and "-+".
            if action not in ("-", "+"):
                sys.exit("Bad action \"{}\" in filter".format(action))
        self.filter = tuple((action == "+", pattern) for action, pattern in rules)

    def __call__(self, name):
        """Return True if the page called *name* should be kept."""
        for action, pattern in self.filter:
            if fnmatch.fnmatch(name, pattern):
                return action
        return True


class Row(sqlite3.Row):
    """``sqlite3.Row`` whose columns are also readable as attributes."""

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.  Translate the
        # IndexError that sqlite3.Row raises for an unknown column into
        # AttributeError so hasattr()/getattr() behave as expected.
        try:
            return self[name]
        except IndexError:
            raise AttributeError(name) from None

    @property
    def isotime(self):
        """The row's ``time`` column (seconds) as ``YYYY-MM-DD HH:MM`` in UTC."""
        return time.strftime("%Y-%m-%d %H:%M", time.gmtime(self.time))


def markdown_header(row, first_published):
    """Build the Pelican metadata header for one version of a wiki page.

    Args:
        row: a row from ``wiki_query``.
        first_published: dict mapping page name to the ISO time of the first
            version seen.  It is updated in place the first time a page name
            is encountered; later versions of the same page get an additional
            ``Modified:`` line carrying their own time.

    Returns:
        The header text, terminated by an empty line.
    """
    if row.name in first_published:
        modtime = "Modified: {}\n".format(row.isotime)
    else:
        modtime = ""
        first_published[row.name] = row.isotime
    return "Title: {}\nAuthor: {}\nDate: {}\n{}\n".format(row.name, row.author, first_published[row.name], modtime)


def main():
    """Rebuild the ``wiki/`` and ``pelican/`` trees from ``trac.db``."""
    # Imported here rather than at module level so that the helpers above
    # remain importable without the converter module on the path.
    import trac2md

    ap = argparse.ArgumentParser()
    ap.add_argument("--source-url")
    args = ap.parse_args()

    # Start from a clean slate.  The directories do not exist on a first run
    # or after "make clean", which must not be an error.
    for dn in ("wiki", "pelican"):
        try:
            shutil.rmtree(dn)
        except FileNotFoundError:
            pass

    for dn in ("wiki", "pelican/content"):
        os.makedirs(dn)

    os.link("pelicanconf.py", "pelican/pelicanconf.py")

    wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)

    keep = Filter()

    first_published = {}

    db = sqlite3.connect("trac.db")
    try:
        db.row_factory = Row

        for row in db.execute(wiki_query):
            if keep(row.name):
                slug = urllib.parse.quote(row.name, "")
                # Explicit UTF-8 so that non-ASCII wiki text does not depend
                # on the locale the script happens to run under.
                with open("wiki/{}.trac".format(slug), "w", encoding = "utf-8") as f:
                    f.write(row.text)
                md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
                with open("pelican/content/{}.md".format(slug), "w", encoding = "utf-8") as f:
                    f.write(md)

        for row in db.execute(attachment_query):
            src, dst = attachment_link(row)
            os.makedirs(os.path.dirname(dst), exist_ok = True)
            os.link(src, dst)
    finally:
        db.close()


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# diff --git a/filter.json b/filter.json
# new file mode 100644 index 0000000..708428e
# ---------------------------------------------------------------------------
#
# [
#   ["+", "WikiStart"],
#   ["-", "CamelCase"],
#   ["-", "EDAToolchainSurvey\""],
#   ["-", "GitRepositories*"],
#   ["-", "InterMapTxt"],
#   ["-", "InterTrac"],
#   ["-", "InterWiki"],
#   ["-", "PageTemplates"],
#   ["-", "PhotoFolder"],
#   ["-", "RecentChanges"],
#   ["-", "SandBox"],
#   ["-", "ScratchPage"],
#   ["-", "TicketQuery"],
#   ["-", "TitleIndex"],
#   ["-", "Trac*"],
#   ["-", "Wiki*"],
#   ["+", "*"]
# ]
#
# NOTE(review): the pattern "EDAToolchainSurvey\"" ends in a literal double
# quote, so it can only match a page whose name ends in '"' -- presumably a
# typo for "EDAToolchainSurvey"; confirm against the wiki's page list.

# ---------------------------------------------------------------------------
# diff --git a/pelicanconf.py b/pelicanconf.py
# new file mode 100644 index 0000000..ab8afd2
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*- #

AUTHOR = "Cryptech Core Team"
SITENAME = "Cryptech Project"

PATH = "content"
TIMEZONE = "UTC"
DEFAULT_LANG = "en"

# Feed generation is usually not desired when developing
SITEURL = ""
RELATIVE_URLS = True
FEED_ALL_ATOM = None
CATEGORY_FEED_ATOM = None
TRANSLATION_FEED_ATOM = None
AUTHOR_FEED_ATOM = None
AUTHOR_FEED_RSS = None

# Blogroll
LINKS = (("Pelican", "http://getpelican.com/"),
         ("Python.org", "http://python.org/"),
         ("Jinja2", "http://jinja.pocoo.org/"))
LINKS_WIDGET_NAME = "Links"

# Social widget.  Can't get rid of this with default theme, only change its name.
# Fiddle with themes later
SOCIAL = ()
SOCIAL_WIDGET_NAME = "Subscribe"

DEFAULT_PAGINATION = 10

#THEME = "/home/blog/pelican-themes/sundown"

# ---------------------------------------------------------------------------
# diff --git a/trac2md.py b/trac2md.py
# new file mode 100755 index 0000000..5a00754
# ---------------------------------------------------------------------------

# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
# Has mutated considerably since then.

import re
from urllib.parse import quote


class Trac2Markdown:
    """Line-oriented converter from Trac wiki markup to Pelican Markdown.

    An instance is called with the page text and the page's slug and returns
    the Markdown text.  ``source_url`` is the base URL that ``source:`` and
    ``browser:`` links are resolved against.
    """

    # "[[br]]" or a double backslash: Trac's explicit line breaks.
    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)

    # A whitespace-delimited CamelCase word, i.e. an implicit wiki link.
    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

    # One (level, pattern) pair per heading level "= x =" .. "====== x ======".
    wikiheading_patterns = tuple(
        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
        for level in range(1, 7)
    )

    # Indentation emitted per list nesting level.  Four spaces is the nesting
    # unit Python-Markdown expects and is also accepted by CommonMark.
    list_indent = "    "

    def convert_headers(self, line):
        """Return *line* with a Trac heading rewritten as an ATX heading.

        Lines that are not headings are returned unchanged.
        """
        for level_count, header in self.wikiheading_patterns:
            match = header.search(line)
            if match is None:
                continue                # Try the next heading level
            title = match.group(1)
            if title:
                # No need to check other heading levels
                return "{} {}".format('#' * level_count, title.rstrip("= \r\t"))
        return line

    def convert_to_creole(self, m):
        """Regex callback: rewrite a single-bracket TracLink as a Creole link."""
        # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
        # Creole's is easier to parse and harder to confuse with partially converted Markdown.

        text = m.group(1).strip()
        if " " in text:
            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
        elif ":" in text or self.camelcase_pattern.match(text):
            return "[[{}]]".format(text)
        else:
            return m.group(0)

    # Probably most of the non-wiki scheme tests should become a table in an
    # extended JSON config file which maps
    #
    # { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

    def convert_wikilinks(self, m):
        """Regex callback: rewrite a Creole ``[[scheme:link|text]]`` link as Markdown."""
        scheme, link, text = [p.strip() if p else p for p in m.groups()]
        if text is None:
            text = link
        # Strip one layer of matching quotes from the link and from the label.
        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
            link = link[1:-1]
        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
            text = text[1:-1]
        if text == link and link.startswith("http") and "://" in link:
            return "<{}>".format(link)
        elif scheme == "attachment:":
            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
        elif scheme in ("source:", "browser:"):
            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
            return "[{}]({{filename}}{}.md)".format(text, link)
        else:
            return "[{}]({})".format(text, link)

    def convert_image(self, m):
        """Regex callback: rewrite a Trac ``[[Image(...)]]`` macro.

        Only the first macro argument (the image name or URL) is used.
        Remote images become an ``<img>`` tag; anything else is treated as an
        attachment of the current page and becomes a Markdown image that
        points into the page's ``{attach}`` directory.
        """
        text = m.group(1).split(",")[0].strip()
        if "://" in text:
            return "<img src=\"{}\">".format(text)
        else:
            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))

    def __init__(self, source_url):
        """Create a converter whose source links are relative to *source_url*."""
        self.source_url = source_url
        # (pattern, replacement) pairs, applied to each line in this order.
        self.pattern_actions = (

            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
            (re.compile(r"(?<!\[)\[([^][]+)\]"), self.convert_to_creole),

            # Convert CamelCase links to explicit links
            (self.camelcase_pattern, r"[[\1]]"),

            # Convert !x quoting
            (re.compile(r"!((?:\w|[#])+)"), r"\1"),

            # Convert (limited subset of) spans
            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"), r"\1"),

            # Convert images
            (re.compile(r"\[\[Image\((.*)\)\]\]"), self.convert_image),

            # Delete Trac macros that have no useful counterpart
            (re.compile(r"\[\[PageOutline\]\]", re.I), r""),

            # Convert wiki links
            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"), self.convert_wikilinks),

            # Convert striked through text
            (re.compile(r"~~([^~]+)~~"), r"<s>\1</s>"),

            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            (re.compile(r"\\\\$"), r"  "),

            # Convert bold and italic text (do this last)
            (re.compile(r"'''"), r"**"),
            (re.compile(r"''"), r"*"),
        )

    def __call__(self, content, slug):
        """Convert the Trac markup *content* of the page *slug* to Markdown.

        *slug* is used to build ``{attach}`` paths for attachments and images.
        Returns the converted text; every output line ends in a newline.
        """
        # The regex callbacks need the slug; it only lives for the duration
        # of the call and is removed even if the conversion raises.
        self.slug = slug
        try:
            return self._convert(content)
        finally:
            del self.slug

    def _convert(self, content):
        """Do the line-by-line conversion for ``__call__``."""
        # Put every explicit line break at the end of its own line, spelled as
        # a double backslash, so the per-line rule in pattern_actions sees it.
        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
        new_content = []

        code_block = False
        in_list = False
        in_table = False
        indent_stack = []       # source indents of the enclosing list levels
        text_indent = 0         # column where the last list item's text began

        for raw_line in old_content:
            line = raw_line.rstrip()
            tail = ["\n"]

            # {{{ ... }}} delimits preformatted text; nothing inside is converted.
            if "{{{" in line:
                code_block = True
                line = line.replace("{{{", "```")
            if "}}}" in line:
                code_block = False
                line = line.replace("}}}", "```")

            if not code_block:

                # Convert tables.  References:
                #   https://github.github.com/gfm/#tables-extension-
                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
                # Table start: line containing "||"; table end: blank line?
                #
                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
                #
                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
                # structure to something markdown will believe.

                if line.strip().startswith("||"):
                    line = line.replace("=|", "|").replace("|=", "|")
                    line = line.replace("||", "|")
                    if not in_table:
                        # The first row doubles as header; follow it with the delimiter row.
                        tail.append("|---" * (line.count("|") - 1) + "|\n")
                        in_table = True
                elif in_table:
                    new_content.append("\n")
                    in_table = False

                #
                # Convert bullet lists.  The start and end of a list needs an empty line.
                #
                nested_line = line.lstrip(' ')
                indent = len(line) - len(nested_line)
                if nested_line.startswith(('- ', '* ')):
                    if not in_list:
                        new_content.append("\n")
                        indent_stack = []
                        in_list = True
                    text_indent = len(line) - len(nested_line[1:].lstrip())
                    # Leave every level that is indented deeper than this item,
                    # then open a new level if this item is deeper than its parent.
                    # The first item of a list is always level 0, however far
                    # the source indented it.
                    while indent_stack and indent < indent_stack[-1]:
                        indent_stack.pop()
                    if not indent_stack or indent > indent_stack[-1]:
                        indent_stack.append(indent)
                    line = self.list_indent * (len(indent_stack) - 1) + nested_line
                elif in_list and indent < text_indent:
                    # A non-item line indented less than the item text ends the list.
                    new_content.append("\n")
                    in_list = False
                    indent_stack = []
                    text_indent = 0

                # Convert headers
                line = self.convert_headers(line)

                # Rest is regexp-driven conversions
                for pattern, action in self.pattern_actions:
                    line = pattern.sub(action, line)

            new_content.append(line)
            new_content.extend(tail)

        return "".join(new_content)