#!/usr/bin/python
#
# This script converts a Trac wiki to DocBook.
# The wiki pages must be in the wiki/ directory and their names must start with "Guide";
# the first page is named GuideIndex.
# The output is written to the docbook/ directory.
#
# Based on the following scripts:
#
# http://trac-hacks.org/wiki/Page2DocbookPlugin
# http://trac.edgewall.org/attachment/wiki/TracWiki/trac_wiki2html.py
#
# See the links above for a list of requirements.
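#
# The script is presumably run from the directory that contains wiki/, docbook/
# and the wiki2docbook/ data directory referenced below, e.g.:
#
#   ./wiki2docbook.py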


import sys
import os
from trac.test import EnvironmentStub, Mock, MockPerm
from trac.mimeview import Context
from trac.wiki.formatter import HtmlFormatter
from trac.wiki.model import WikiPage
from trac.web.href import Href

import urllib
from tidy import parseString
import libxml2
import libxslt
import re

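# Directory containing the XSLT stylesheets used below; it is expected to sit
# next to the wiki/ and docbook/ directories in the current working directory.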
datadir = os.getcwd() + "/wiki2docbook"


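# Wrapper stylesheet: imports html2db.xsl from the data directory and sets the
# name of the placeholder root element produced by the conversion.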
xhtml2dbXsl = u"""<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:import href=\"file:///""" + urllib.pathname2url(datadir + '/html2db/html2db.xsl') + """\" />
  <xsl:output method="xml" indent="no" encoding="utf-8"/>
  <xsl:param name="document-root" select="'__top_element__'"/>
</xsl:stylesheet>
"""

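# Wrapper stylesheet: imports headingsNormalizer.xsl, which presumably rewrites
# the XHTML heading levels before the DocBook conversion.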
normalizedHeadingsXsl = u"""<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:import href=\"file:///""" + urllib.pathname2url(datadir + '/headingsNormalizer/headingsNormalizer.xsl') + """\" />
  <xsl:output method="xml" indent="no" encoding="utf-8"/>
  <xsl:param name="defaultTopHeading" select="FIXME"/>
</xsl:stylesheet>
"""

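# Parse both stylesheets once so they can be reused for every page.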
normalizedHeadingsXsl_xmldoc = libxml2.parseDoc(normalizedHeadingsXsl)
normalizedHeadingsXsl_xsldoc = libxslt.parseStylesheetDoc(normalizedHeadingsXsl_xmldoc)

xhtml2dbXsl_xmldoc = libxml2.parseDoc(xhtml2dbXsl)
xhtml2dbXsl_xsldoc = libxslt.parseStylesheetDoc(xhtml2dbXsl_xmldoc)

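# Convert an HTML fragment (as produced by Trac's HtmlFormatter) to a DocBook
# fragment: tidy it into well-formed XHTML, normalize the headings, then apply
# the html2db stylesheet.  The result is rooted at a placeholder
# <__top_element__> element that process_pages() renames later.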
def html2docbook(html):

        options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0, input_encoding='utf8', output_encoding='utf8', doctype='auto', wrap=0, char_encoding='utf8')
        xhtml = parseString(html.encode("utf-8"), **options)

        xhtml_xmldoc = libxml2.parseDoc(str(xhtml))

        xhtml2_xmldoc = normalizedHeadingsXsl_xsldoc.applyStylesheet(xhtml_xmldoc, None)

        nhstring = normalizedHeadingsXsl_xsldoc.saveResultToString(xhtml2_xmldoc)

        docbook_xmldoc = xhtml2dbXsl_xsldoc.applyStylesheet(xhtml2_xmldoc, None)

        dbstring = xhtml2dbXsl_xsldoc.saveResultToString(docbook_xmldoc)

        xhtml_xmldoc.freeDoc()
        xhtml2_xmldoc.freeDoc()
        docbook_xmldoc.freeDoc()
        return dbstring.decode('utf-8')


text = {}   # wiki text
depth = {}  # document depth, 0 for index, leaf documents have depth 1 or 2
parent = {} # parent document (if depth > 0)
inner = {}  # defined for documents that are parents

# top element indexed by depth
top_element = [ 'book', 'chapter', 'section', 'section', 'section', 'section', 'section', 'section', 'section', 'section' ]

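# Stub Trac environment and request so the wiki formatter can be used outside
# of a running Trac instance.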
env = EnvironmentStub()
req = Mock(href=Href('/'), abs_href=Href('http://www.example.com/'),
           authname='anonymous', perm=MockPerm(), args={})
context = Context.from_request(req, 'wiki')


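# Read the wiki source of a page from wiki/<name> and register the page name in
# the stub environment with dummy content, presumably so that links between the
# pages are rendered as links to existing pages.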
def read_file(name):
        text[name] = file("wiki/" + name).read().decode('utf-8')
        page = WikiPage(env)
        page.name = name
        page.text = '--'
        page.save('', '', '::1', 0)


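# Read GuideIndex and walk its bullet list of [wiki:Guide...] links to build the
# document tree; each referenced page is read as well.  The list is expected to
# look something like this (two spaces of indentation per level; the page names
# shown here are only illustrative):
#
#   * [wiki:GuideIntroduction]
#     * [wiki:GuideMainWindow]
#       * [wiki:GuideMainWindowFilePane]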
def read_index():
        index_name = "GuideIndex"
        read_file(index_name)
        index_text = text[index_name]
        depth[index_name] = 0
        inner[index_name] = 1

        stack = [ index_name , '', '', '' ]

        for line in index_text.splitlines() :
                match = re.match(r'^( *)\* \[wiki:(Guide[a-zA-Z0-9]*)', line)
                if match:
                        name = match.group(2)
                        d = len(match.group(1)) / 2
                        if (d > 0):
                                depth[name] = d
                                parent[name] = stack[d - 1]
                                inner[stack[d - 1]] = 1
                                stack[d] = name
                                read_file(name)

# Exclude links with depth > 1 from the wiki text; they will be included indirectly.
def filter_out_indirect(text):
        out = ""
        for line in text.splitlines() :
                match = re.match(r'^( *)\* \[wiki:(Guide[a-zA-Z0-9]*)', line)
                d = 1
                if match:
                        d = len(match.group(1)) / 2
                if (d == 1):
                        out = out + line + "\n"
        return out

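# Convert every page: wiki text -> HTML (via Trac) -> DocBook.  Links of the
# form /wiki/Guide... are turned into internal #Guide... anchors, the child
# list of a parent page is replaced by XInclude references, the placeholder
# root element is renamed to book/chapter/section according to the page depth,
# and the result is written to docbook/<name>.xml.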
def process_pages():
        for name in text.keys():
                txt = text[name]

                if name in inner:
                        txt = filter_out_indirect(txt)

                html = HtmlFormatter(env, context, txt).generate()

                html = html.replace("/wiki/Guide", "#Guide")

                top = top_element[depth[name]]
                db = html2docbook(html)

                if name in inner:
                        # replace list items with XIncludes, FIXME: this is ugly
                        r = re.compile(r'<itemizedlist[^>]*>')
                        db = r.sub(r'', db)

                        r = re.compile(r'</itemizedlist>')
                        db = r.sub(r'', db)

                        r = re.compile(r'<listitem>\s*<para>\s*<link\s*linkend="(Guide[a-zA-Z0-9]*)">[^<]*</link>\s*</para>\s*</listitem>')
                        db = r.sub(r'<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="\1.xml"/>\n', db)


                db = db.replace("<__top_element__>", "<" + top + " id=\"" + name + "\">")
                db = db.replace("</__top_element__>", "</" + top + ">")

                open("docbook/" + name + ".xml", "w").write(db.encode('utf-8'))


read_index()
process_pages()