# MarkdownMath.py

# Description: Markdown math extension
# Documentation: math_syntax.txt

from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import re
from markdown.inlinepatterns import Pattern
from markdown.util import etree, AtomicString
from markdown import Extension
from markdown.treeprocessors import Treeprocessor

class MarkdownMath_Extension(Extension):
    """
    Math extension for Python-Markdown.

    Registers three inline patterns (LaTeX display-math `$$...$$`,
    LaTeX inline-math `$...$`, and AsciiMath `''...''`) and a
    tree-processor that cleans up the generated math <script>
    elements.
    """

    def extendMarkdown(self, md, md_globals=None):
        """
        Add MarkdownMath to Markdown instance.

        md:
        The Markdown instance to extend.

        md_globals:
        Unused. It defaults to None so that the method can be
        called both as extendMarkdown(md, md_globals) and as
        extendMarkdown(md).
        """
        md.registerExtension(self)

        # Add math inline-patterns.

        # Math should not be affected by markdown \ escaping.
        # For example,
        #
        # $\begin{bmatrix} a & b \\ c & d \end{bmatrix}$
        #
        # The row-end marker \\ should not be interpreted
        # as escaping \. Therefore, the math inline-pattern
        # must run before `escape` inline-pattern.

        # Should the `backtick` inline-pattern run before math?
        # Ideally, inline-patterns should be parsed recursively,
        # as we discuss in problem.txt. However, Python Markdown
        # parses them sequentially, and therefore forces us to
        # choose an order. We handle backticks first, because
        # we don't want the extension to modify how backticks
        # work in Markdown.

        # The same can be asked for _emphasis_ and others.
        # Since the mathematics is its own language, we
        # want it to not be affected by other inline-patterns,
        # and therefore run mathematics before most of everything
        # else. This makes the backticks an exception.

        # The math inline-patterns block future inline-patterns
        # from modifying the math (by using AtomicString in
        # MarkdownMath_Pattern).

        # First run latex display-math $$, because
        # otherwise $ would incorrectly handle it.
        md.inlinePatterns.add(
            'display-latex-math',
            MarkdownMath_Pattern('$$', '$$', 'math/tex; mode=display', 'display-latex-math'),
            '>backtick')

        # Then run Latex inline-math $.
        md.inlinePatterns.add(
            'inline-latex-math',
            MarkdownMath_Pattern('$', '$', 'math/tex', 'latex-math'),
            '>display-latex-math')

        # Finally run Asciimath inline-math ''.
        md.inlinePatterns.add(
            'inline-ascii-math',
            MarkdownMath_Pattern("''", "''", 'math/asciimath', 'ascii-math'),
            '>inline-latex-math')

        # Consider $a + `b` + c$. This will end up embedding
        # a <code>b</code> tag into the math expression.
        # Using a tree-processor, we replace the <code> tag
        # with its text wrapped in backticks (`b`), to recover
        # the original text. This solves the problem of using
        # an even number of backticks in a math-expression.
        # It does not solve the problem of using an odd number
        # of backticks.

        # We place the tree-processor at the end, because it
        # must occur after the Region-extension converts
        # <region> tags to <script> tags.
        md.treeprocessors.add(
            'math-replace-code',
            MarkdownMath_TreeProcessor(md.parser),
            '_end')

class MarkdownMath_Pattern(Pattern):
    """
    Inline pattern which turns text between a begin-delimiter
    and an end-delimiter into

        <span class="className"><script type="scriptType">...</script></span>

    The regular expression has three groups: the text before
    the begin-delimiter, the math between the delimiters, and
    the text after the end-delimiter.
    """

    def __init__(self, beginString, endString, scriptType, className):
        """
        beginString, endString:
        The literal delimiters which open and close the math.
        They are escaped, so they are never interpreted as
        regular-expression syntax.

        scriptType:
        The value for the `type` attribute of the <script> element.

        className:
        The value for the `class` attribute of the <span> element.
        """
        # Note that the base-class constructor is not called;
        # the regular expression is built and compiled here,
        # and handed out by getCompiledRegExp().
        self.className = className
        self.scriptType = scriptType
        # The last group must be greedy. Without re.MULTILINE,
        # $ also matches just before a newline which ends the
        # string. A non-greedy (.*?)$ would therefore stop at
        # that position, and drop the trailing newline from the
        # text following the math.
        self.pattern = (
            r'^(.*?)' +
            re.escape(beginString) +
            r'(.*?)' +
            re.escape(endString) +
            r'(.*)$'
            )
        # The re.DOTALL allows the math, and the surrounding
        # text, to span multiple lines.
        self.regex = re.compile(self.pattern, re.DOTALL | re.UNICODE)

    def getCompiledRegExp(self):
        """ Returns the regular expression compiled in the constructor. """
        return self.regex

    def handleMatch(self, match):
        """
        Returns a <span> element, which contains a <script>
        element, which contains the matched math (group 2)
        as its text.
        """
        # We need a span-element to be able to assign
        # a class-identifier for CSS-styling. Why not
        # assign a class to the <script> element?
        # Because MathJax gets rid of it, and so it
        # cannot be used for styling.
        spanElement = etree.Element(
            'span',
            {
                'class' : self.className
            })

        scriptElement = etree.SubElement(
            spanElement,
            'script',
            {
                'type' : self.scriptType
            })

        # The AtomicString makes sure that the expression will not
        # be considered by the other inline patterns.
        scriptElement.text = AtomicString(match.group(2))

        return spanElement

class MarkdownMath_TreeProcessor(Treeprocessor):
    """
    Among all <script type="math/..."> tags, replaces a
    child <code> tag with its text wrapped in backticks,
    and removes all child tags, while preserving the text
    which follows each child tag. The <code> tag is created
    by backticks in a math expression, which we then fix
    here. Also replaces occurrences of </ with < /, to avoid
    accidentally closing the <script> tag.
    """

    def __init__(self, md):
        """
        Does nothing; `md` is accepted, but not stored, and
        the base-class constructor is not called.
        """

    def printIt(self, element, level = 0):
        """
        Debugging aid. Prints the tag of `element`, and the
        tags of its descendants, one per line, indented by
        one tab per nesting level (starting from `level`).
        """
        print('\t' * level + element.tag)
        for child in element:
            self.printIt(child, level + 1)

    def run(self, root):
        """
        Flattens every math <script> element under `root`
        into plain text. The tree is modified in place;
        nothing is returned.
        """
        # Iterate over all `script` tags.
        for element in root.findall(".//script"):
            if not element.get('type', '').startswith('math/'):
                # Only visit those `script` tags which
                # have `type` attribute beginning with
                # `math/`.
                continue

            # ElementTree stores a missing text as None, in
            # each of `text` and `tail`. Each of them is
            # therefore replaced with an empty string before
            # concatenation; otherwise, say, a <code> tag with
            # nothing after it would raise a TypeError.
            pieceSet = [element.text or '']

            # In case the script contains mixed content,
            # i.e. text interleaved with tags, the text
            # following a tag is stored in the `tail`
            # member. While we remove the tags, we want
            # to preserve the text. We iterate over a copy
            # of the children, since the children are
            # removed during the iteration.
            for child in list(element):
                if child.tag == 'code':
                    # Add the text in a <code>
                    # tag back to the main text.
                    pieceSet.append('`' + (child.text or '') + '`')
                # Add the text following the child tag.
                pieceSet.append(child.tail or '')
                # Remove the child tag.
                element.remove(child)

            # Inside a <script> element, the characters are interpreted
            # as CDATA, and the first occurrence of </ is interpreted
            # as the beginning of the end-tag </script>. Therefore,
            # if the mathematics contains a </, we need to convert it
            # to something else. We interpret </ as a combination of
            # less-than and division, and therefore replace it with < /.
            element.text = ''.join(pieceSet).replace('</', '< /')

def makeExtension(*args, **kwargs):
    """
    Creates the math extension, forwarding all positional
    and keyword arguments to MarkdownMath_Extension.
    """
    extension = MarkdownMath_Extension(*args, **kwargs)
    return extension