extract.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. #!/usr/bin/env python3
  2. import fnmatch
  3. import hashlib
  4. import json
  5. import os
  6. import shutil
  7. import sqlite3
  8. import sys
  9. import time
  10. import urllib.parse
  11. import argparse
  12. import trac2md
# Every stored revision of every wiki page, ordered by page name and then by
# version number, so that a consumer iterating the result sees each page's
# revisions from oldest to newest.
# NOTE(review): `time` is divided by 1000000 and the result is later passed to
# time.gmtime() by Row.isotime, so the column is presumably stored in
# microseconds since the epoch -- confirm against the Trac schema.
wiki_query = '''
SELECT
name,
author,
version,
time / 1000000 AS time,
text
FROM wiki
ORDER BY
name, version
'''
# Metadata for every attachment whose `type` is 'wiki', ordered by filename
# and then by timestamp.  `id` is used by attachment_link() as the key of the
# page directory the attachment is published under.
# NOTE(review): as with wiki_query, `time / 1000000` presumably converts
# microseconds to seconds -- confirm against the Trac schema.
attachment_query = '''
SELECT
id,
filename,
size,
author,
description,
ipnr,
time / 1000000 AS createdtime
FROM
attachment
WHERE
type = 'wiki'
ORDER BY
filename, time
'''
  40. def attachment_link(row):
  41. h = lambda whatever: hashlib.sha1(whatever.encode()).hexdigest()
  42. h1 = h(row.id)
  43. h2 = h(row.filename)
  44. fn2 = os.path.splitext(row["filename"])[1]
  45. return \
  46. os.path.join("attachments", "wiki", h1[:3], h1, h2 + fn2), \
  47. os.path.join("pelican", "content", urllib.parse.quote(row.id, ""), row.filename)
  48. class Filter:
  49. def __init__(self, filename = "tools/filter.json"):
  50. with open(filename) as f:
  51. filter = json.load(f)
  52. if not all(action in "-+" for action, pattern in filter):
  53. sys.exit("Bad action \"{}\" in filter".format(action))
  54. self.filter = tuple((action == "+", pattern) for action, pattern in filter)
  55. def __call__(self, name):
  56. for action, pattern in self.filter:
  57. if fnmatch.fnmatch(name, pattern):
  58. return action
  59. return True
  60. class Row(sqlite3.Row):
  61. def __getattr__(self, name):
  62. return self[name]
  63. @property
  64. def isotime(self):
  65. return time.strftime("%Y-%m-%d %H:%M", time.gmtime(self.time))
  66. def markdown_header(row, first_published):
  67. if row.name in first_published:
  68. modtime = "Modified: {}\n".format(row.isotime)
  69. else:
  70. modtime = ""
  71. first_published[row.name] = row.isotime
  72. return "Title: {}\nAuthor: {}\nDate: {}\n{}\n".format(row.name, row.author, first_published[row.name], modtime)
  73. def main():
  74. ap = argparse.ArgumentParser()
  75. ap.add_argument("--source-url")
  76. args = ap.parse_args()
  77. for dn in ("wiki", "pelican"):
  78. if os.path.exists(dn):
  79. shutil.rmtree(dn)
  80. for dn in ("wiki", "pelican/content"):
  81. os.makedirs(dn)
  82. os.link("tools/pelicanconf.py", "pelican/pelicanconf.py")
  83. wiki_to_markdown = trac2md.Trac2Markdown(args.source_url)
  84. keep = Filter()
  85. first_published = {}
  86. db = sqlite3.connect("trac.db")
  87. db.row_factory = Row
  88. for row in db.execute(wiki_query):
  89. if keep(row.name):
  90. slug = urllib.parse.quote(row.name, "")
  91. #print(slug, row.version)
  92. with open("wiki/{}.trac".format(slug), "w") as f:
  93. f.write(row.text)
  94. md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
  95. with open("pelican/content/{}.md".format(slug), "w") as f:
  96. f.write(md)
  97. for row in db.execute(attachment_query):
  98. src, dst = attachment_link(row)
  99. #print("{} => {}".format(dst, src))
  100. if not os.path.isdir(os.path.dirname(dst)):
  101. os.makedirs(os.path.dirname(dst))
  102. os.link(src, dst)
  103. db.close()
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()