# MarkdownRegion.py

# Description: Markdown region extension
# Documentation: markdown_region.txt

"""
Region extension for Python-Markdown
====================================

Adds regions, which wrap the content in an html-element.

A region declaration is of the form

    !!! <element key1 = "value1" key2 = "value2" ...>
        Stuff here
        * more stuff here

This is similar to the form of admonitions.
"""

from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

from markdown import Extension
from markdown.blockprocessors import BlockProcessor
from markdown.util import etree, AtomicString
from markdown.treeprocessors import Treeprocessor
import re

class MarkdownRegion_Extension(Extension):
    """
    Python-Markdown extension which installs the region processors.

    Region handling is done in two stages, each implemented by its own
    processor: a block-processor which builds <region> elements, and a
    tree-processor which renames them to real html-elements.
    """

    def extendMarkdown(self, md, md_globals):
        """
        Register this extension and its processors with a Markdown instance.

        md:
        The Markdown instance to extend.

        md_globals:
        Not used by this extension.
        """
        md.registerExtension(self)

        # Stage 1: the block-processor wraps region content
        # in <region> elements.
        blockProcessor = MarkdownRegion_BlockProcessor(md.parser)
        md.parser.blockprocessors.add('region', blockProcessor, '_begin')

        # Stage 2: <region> is not a valid html-element, so the
        # tree-processor replaces each one with the element that
        # was requested in the declaration (a <div> by default).
        treeProcessor = MarkdownRegion_TreeProcessor(md.parser)
        md.treeprocessors.add('region', treeProcessor, '_begin')

class MarkdownRegion_BlockProcessor(BlockProcessor):
    """
    Block-processor which turns region declarations into <region> elements.

    A region declaration is of the form

        !!! <tag aKey = "aValue" bKey = "bValue">
            Indented content

    The tag is stored in the 'tag' attribute of the created <region>
    element, and the key-value pairs are stored as its other attributes.
    The 'remark-content' attribute selects how the indented content is
    handled:

    'remark' (default): the content is parsed as Markdown.
    'remark-no-p': as 'remark', but the parser state is set to 'list'
    while parsing the content.
    'text': the content is stored as raw text through the html-stash.

    Content with any other 'remark-content' value is not stored.
    """

    # We allow the region to be preceded by
    # stuff inside a block, as in
    #
    # preceding stuff !!! <tag...>
    #
    # It can be anything provided it isn't indented.
    precedingPattern = r'((?:^|\n)(?!    ).*?)'

    introPattern = r'!!!'
    whitespacePattern = r'[ \t]*'
    namePattern = r'([\w\-]*)'
    stringPattern = r'"([^"]*)"'
    trailingWhitespacePattern = r'[ \t]*\n?'

    # A single key = "value" pair.
    # Group 1 is the key, group 2 is the value.
    keyPattern = (
        r'(?:' +
        namePattern +
        whitespacePattern +
        r'=' +
        whitespacePattern +
        stringPattern +
        whitespacePattern +
        r')')

    # A whole region declaration.
    # Group 1 is the preceding stuff, group 2 is the tag,
    # and group 3 is the text of all the key-value pairs.
    regionPattern = (
        precedingPattern +
        introPattern +
        whitespacePattern +
        r'<' +
        namePattern +
        whitespacePattern +
        r'(' + keyPattern + r'*)' +
        r'>' +
        trailingWhitespacePattern)

    regionRegex = re.compile(regionPattern)
    keyRegex = re.compile(keyPattern)

    def test(self, parent, block):
        """
        Return whether this processor should handle the block.

        A block is handled if it contains a region declaration,
        or if it is indented and the last child of the parent
        is a <region> element.
        """
        if self.regionRegex.search(block):
            return True
        return self._continuesRegion(parent, block)

    def run(self, parent, blockSet):
        """
        Process the first block in the block-set.

        parent:
        The element under which the region is created, or whose
        last child receives the indented content.

        blockSet:
        The list of blocks still to be parsed. The first block is
        removed; parts of it which are not handled by this call
        are inserted back to the front of the list.
        """
        block = blockSet.pop(0)

        # A block is a Markdown concept, which means
        # text without empty lines.

        # Suppose we are given a block like this:
        #
        # !!! <div class = "A">
        #     Stuff
        # !!! <div class = "B">
        # !!! <div class = "C">
        #     !!! <div class = "C-A">
        #
        # Then we would like to handle the different
        # <div>-regions separately. The parseBlocks()
        # function separates the block into a list of
        # texts, each starting at a region declaration.
        pieces = self.parseBlocks(block)
        if not pieces:
            # There is nothing to process.
            return

        # Pick the first separated piece. In our example,
        # this is
        #
        # !!! <div class = "A">
        #     Stuff
        #
        # The other pieces are pushed back to the set of
        # blocks to handle.
        block = pieces[0]
        blockSet[0:0] = pieces[1:]

        # Since there can be empty lines, a subsequent part
        # of the region may be denoted by indentation. Therefore,
        # there are two region cases:
        #
        # 1) The piece contains !!! <div class = "A">
        # 2) The piece begins with indentation, and is preceded
        # by a case-1-block, or a case-2-block.
        #
        # The declaration must be searched from the piece itself,
        # not from the original block. Otherwise text which
        # precedes a declaration on an earlier line, as in
        #
        # Some text
        # !!! <div class = "A">
        #
        # would be mistaken for the declaration, and be lost.
        match = self.regionRegex.search(block)
        if match is not None:
            # This is case 1.

            # The piece may capture preceding stuff like this:
            #
            # Preceding stuff !!! <div class = "A">
            #     Stuff
            preceding = match.group(1)
            if preceding.strip() != '':
                # Parse the preceding stuff.
                self.parser.parseChunk(parent, preceding)

            region = self._createRegion(parent, match)

            # Remove the !!! <div class = "A">  part from the
            # piece, so that we get to the actual content in the
            # region.
            block = block[match.end():]
        elif self._continuesRegion(parent, block):
            # This is case 2.

            # Rather than creating a new html-element,
            # we append the indented content into the
            # previously created element.
            region = self.lastChild(parent)
        else:
            # The piece is neither a declaration nor a continuation
            # of a region; it is stuff which came before a declaration
            # on a later line. Since self.test() rejects such a piece,
            # it is left to the other block-processors.
            self.parser.parseChunk(parent, block)
            return

        # In either region case, we now have indented content.
        # However, it may be followed by unindented content:
        #
        #     Stuff
        # Stuff ended, and now something else follows.

        # Deindent one level from the block, and store the
        # unindented stuff following the indented stuff
        # in 'theRest'.
        block, theRest = self.detab(block)

        contentType = region.get('remark-content', 'remark')

        # The standard ParagraphProcessor block-processor
        # works specially depending on parser.state. If
        # it isn't 'list', then it wraps the content in
        # the <p> element.
        noParagraphs = (contentType == 'remark-no-p')
        if noParagraphs:
            self.parser.state.set('list')

        # At this point, the block consists solely of the
        # indented content, which has been deindented.
        try:
            if contentType in ('remark', 'remark-no-p'):
                # The content is to be interpreted as Markdown.
                # Parse the block recursively.
                self.parser.parseChunk(region, block)
            elif contentType == 'text':
                # The content is to be interpreted as raw text.
                self._appendRawText(region, block)
        finally:
            # Restore the parser state even if parsing fails.
            if noParagraphs:
                self.parser.state.reset()

        if theRest:
            # Insert the unindented stuff back into the set
            # of blocks to process.
            blockSet.insert(0, theRest)

    def parseBlocks(self, block):
        """
        Split a block at its region declarations.

        Returns a list of texts whose concatenation is the block.
        Every text except the first begins at a region declaration
        (including the stuff which the declaration captures as
        preceding it); the first text is whatever comes before the
        second split point. An empty block gives an empty list.
        """
        previousStart = 0
        blockSet = []
        for match in self.regionRegex.finditer(block):
            # Group 1 begins where the match begins, so a match
            # at the current start does not split anything.
            if match.start() != previousStart:
                blockSet.append(block[previousStart : match.start(1)])
                previousStart = match.start(1)

        if previousStart < len(block):
            blockSet.append(block[previousStart : ])

        return blockSet

    def _continuesRegion(self, parent, block):
        """
        Return whether the block is indented content which
        follows a <region> element under the parent.
        """
        if not block.startswith(' ' * self.tab_length):
            return False
        sibling = self.lastChild(parent)
        return sibling is not None and sibling.tag == 'region'

    def _createRegion(self, parent, match):
        """
        Create a <region> element under the parent from a
        match of a region declaration, and return it.
        """
        # The !!! <div class = "A"> part contains
        # all the data concerning the generated
        # html-element.

        # Extract the region tag.
        tagName = match.group(2)
        if tagName == '':
            tagName = 'div'

        # Create a 'region' sub-element for the current
        # element-tree node. The 'region' element does not
        # exist in html; the tag is changed later by the
        # tree-processor.
        region = etree.SubElement(
            parent, 'region',
            {
                # The actual element-tag is stored as an attribute.
                'tag' : tagName
            })

        # Set the key-value pairs as element attributes.
        for keyMatch in self.keyRegex.finditer(match.group(3)):
            region.set(keyMatch.group(1), keyMatch.group(2))

        return region

    def _appendRawText(self, region, block):
        """
        Store the block as raw text in the region's text field,
        appending it to the text which is already there.
        """
        # Markdown's post-processor substitutes html-entities
        # for & < >. To avoid this, all raw html must be stored
        # in Markdown's htmlStash. The function returns a
        # placeholder string, which is expanded by the html
        # post-processor.
        encodedBlock = self.parser.markdown.htmlStash.store(block)

        # One has to be careful when combining strings here;
        # the string operations return a str-type, which
        # loses the AtomicString super-class. So first do
        # the operations, and then wrap them into an
        # AtomicString.
        if region.text is None:
            region.text = AtomicString(encodedBlock + '\n')
        else:
            region.text = AtomicString(
                region.text + '\n' + encodedBlock + '\n')

class MarkdownRegion_TreeProcessor(Treeprocessor):
    """
    A Treeprocessor that traverses a tree, replacing each 'region' tag
    with the tag stored in the element's 'tag' attribute ('div' if
    there is no such attribute).
    """

    def __init__(self, md):
        """
        Construct the processor.

        The argument is not used, and the base-class
        constructor is not called.
        """

    def run(self, root):
        """
        Rename the <region> elements under the root in place.

        The 'tag' and 'remark-content' attributes are only used
        by this extension, so they are removed from the elements.
        """
        for element in root.findall(".//region"):
            element.tag = element.get('tag', 'div')
            # The attributes are removed with a default, so that an
            # element which lacks them does not raise a KeyError.
            element.attrib.pop('tag', None)
            element.attrib.pop('remark-content', None)

def makeExtension(*args, **kwargs):
    """
    Create the region extension, forwarding all arguments
    to the MarkdownRegion_Extension constructor.
    """
    extension = MarkdownRegion_Extension(*args, **kwargs)
    return extension