MarkdownScope.py

# Description: Markdown scoped links extension
# Documentation: markdown_scoped_links.txt

from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import re
from markdown.inlinepatterns import LinkPattern
from markdown.util import etree, AtomicString
from markdown import Extension
from markdown.treeprocessors import Treeprocessor

class MarkdownScope_Extension(Extension):
    """ Scope extension for Python-Markdown. """

    def extendMarkdown(self, md, md_globals):
        """ Add MarkdownScope to Markdown instance. """
        md.registerExtension(self)

        # Add scoped references.
        md.inlinePatterns.add(
            'scoped-reference',
            Markdown_ScopedReference_Pattern(md),
            '>reference')

        # Remove standard references.
        del md.inlinePatterns['reference']
        del md.inlinePatterns['image_reference']

        # Add scoped link-definitions.
        md.inlinePatterns.add(
            'scoped-link-definition',
            Markdown_LinkDefinition_Pattern(md),
            '<scoped-reference')

        # for line in md.inlinePatterns:
        #     print(line)

        # Remove the Reference preprocessor.
        del md.preprocessors['reference']

        # Add the scoped-link resolver.
        # This has to happen after the parsing of
        # the inline-patterns, which is done by the
        # 'inline' tree-processor.
        md.treeprocessors.add(
            'scoped-reference',
            MarkdownScope_TreeProcessor(md.parser),
            '>inline')

class Markdown_LinkDefinition_Pattern(LinkPattern):
    # The regexes follow those in Markdown's
    # ReferencePreprocessor, for compatibility.

    idPattern = r'\[([^\]]*)\]'
    urlPattern = r'([^ \n]*)'
    titlePattern = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'

    # The only change here is that we allow the optional 
    # newline directly, since we have the block in string 
    # form.
    linkDefinitionPattern = (
        r'[ ]{0,3}' + 
        idPattern +
        r':\s*' + 
        urlPattern +
        r'[ \t]*(?:\n)?[ \t]*' +
        r'(%s)?' % titlePattern
        )

    def __init__(self, markdown):
        super(Markdown_LinkDefinition_Pattern, self).__init__(
            self.linkDefinitionPattern, markdown)

    def handleMatch(self, match):
        id = match.group(2).strip().lower()

        url = match.group(3).lstrip('<').rstrip('>')
        if not url:
            url = ''
        url = self.sanitize_url(self.unescape(url))

        title = match.group(6) or match.group(7) or match.group(8)
        if not title:
            title = ''

        # We store the link-definition as an element,
        # and handle it later in a tree-processor.
        element = etree.Element(
            'scoped-link-definition',
            {
                'id' : id,
                'url' : url,
                'title' : title
            })

        #print('DEFINITION', id, url)

        return element

class Markdown_ScopedReference_Pattern(LinkPattern):
    # These regular expressions were directly copied
    # from Python Markdown's implementation, for
    # compatibility.

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    imagePattern = r'(\!?)'

    NOBRACKET = r'[^\]\[]*'
    textPattern = (
        r'\[(' +
        (NOBRACKET + r'(\[')*6 +
        (NOBRACKET + r'\])*')*6 +
        NOBRACKET + 
        r')\]'
    )

    idPattern = r'\[([^\]]*)\]'
    referencePattern = (
        imagePattern + 
        textPattern + 
        r'\s?' + 
        idPattern
        )

    def __init__(self, markdown):
        super(Markdown_ScopedReference_Pattern, self).__init__(
            self.referencePattern, markdown)

    def handleMatch(self, match):
        try:
            # The link has an explicit
            # link-id. Use that.
            id = match.group(10).lower()
        except IndexError:
            id = None

        if not id:
            # The link is of the form "[Google][]"
            # or "[Google]". Since there is no explicit link-id, 
            # we use the link-description as the link-id.
            # The link-id is case-insensitive by the
            # Markdown specification.
            id = match.group(3).lower()

        # Clean up linebreaks in id
        id = self.NEWLINE_CLEANUP_RE.sub(' ', id)

        imageTag = match.group(2)

        #print('LINK', imageTag, id)

        referenceType = 'link'
        if imageTag != '':
            referenceType = 'image'

        description = self.unescape(match.group(3))

        # Create an element for the link,
        # and store the link-id in it.
        element = etree.Element(
            'scoped-reference',
            {
                'id' : id,
                'type' : referenceType,
                'description' : description
            })

        if referenceType == 'link':
            # Store the description in the text argument,
            # to parse it further. For example, it may 
            # contain an emphasis.
            element.text = description

        return element

class Link(object):
    def __init__(self, title = '', url = ''):
        self.title = title
        self.url = url

class MarkdownScope_TreeProcessor(Treeprocessor):
    def __init__(self, md):
        None

    def moveUpLinkSets(self, root):
        # The set of elements which create a new scope
        # for the link-definitions.
        scopeElementSet = {
            # Block
            'div',
            # List item
            'li',
            # Definition
            'dd',
            # Table item
            'td'
        }

        # The set of elements which to remove if they
        # become empty due to moving the link-definition.
        emptyElementSet = {
            'p'
        }

        parentSet = {c:p for p in root.iter() for c in p}

        removeSet = []
        while True:
            changed = False
            for child in root.findall('.//scoped-link-definition'):
                parent = parentSet[child]
                if (not parent.tag in scopeElementSet):
                    parent.remove(child)
                    grandParent = parentSet[parent]
                    grandParent.append(child)
                    parentSet[child] = grandParent
                    changed = True
                    if (parent.tag in emptyElementSet and
                        len(parent) == 0):
                        removeSet.append((parent, grandParent))

            if not changed:
                break

        for (parent, grandParent) in removeSet:
            grandParent.remove(parent)

    def gatherLinkSets(self, root):
        '''
        Gathers link-definitions into a dictionary
        which maps a link-id to a Link object. This
        dictionary is stored in the 'remarkLinkSet' 
        attribute of the element itself.
        '''
        linkSet = dict()
        for child in root:
            if child.tag == 'scoped-link-definition':
                id = child.get('id')
                title = child.get('title')
                url = child.get('url')

                linkSet[id] = Link(title, url)

        if len(linkSet) > 0:
            root.set('remarkLinkSet', linkSet)

        for child in root:
            self.gatherLinkSets(child)

    def resolveLinks(self, root, parentScope):
        # Update the scope.
        scope = dict(parentScope)
        scope.update(root.get('remarkLinkSet', dict()))

        # Resolve the links at this level.
        for child in root:
            if child.tag != 'scoped-reference':
                continue

            # There are two kinds of references:
            # links and images.
            referenceType = child.get('type')
            child.attrib.pop('type')

            if referenceType == 'link':
                # For a link, we create an <a> element.
                elementTag = 'a'
                urlAttribute = 'href'
            elif referenceType == 'image':
                # For an image, we create an <img> element.
                elementTag = 'img'
                urlAttribute = 'src'
            else:
                # The type of the reference is unknown.
                # Skip it.
                continue

            # Change the current element tag to
            # the desired tag.
            child.tag = elementTag

            # Get the reference id, 
            # and remove it from the attbributes.
            id = child.get('id')
            child.attrib.pop('id')

            # Get the reference description, 
            # and remove it from the attbributes.
            description = child.get('description')
            child.attrib.pop('description')

            if referenceType == 'link':
                # For a link, the description is the
                # text in the link; it is already in
                # place.
                None
            else:
                # For an image, the description is the
                # alternative text for the image.
                child.set('alt', description)

            # Get the link-definition based on its id.
            link = scope.get(id)
            if link == None:
                # The link id is unknown. 
                continue

            # Set the url of the reference.
            url = link.url.strip()
            if url != '':
                child.set(urlAttribute, url)

            # Set the tool-tip of the reference.
            title = link.title.strip()
            if title != '':
                child.set('title', title)

        # Resolve the links at the child elements.
        for child in root:
            self.resolveLinks(child, scope)

    def removeLinkSets(self, root):
        if root.get('remarkLinkSet') != None:
            root.attrib.pop('remarkLinkSet')

        # Gather the link elements for removal.
        removeSet = []
        for child in root:
            if (child.tag == 'scoped-link-definition' or
                child.tag == 'scoped-reference'):
                removeSet.append(child)
            else:
                self.removeLinkSets(child)

        # Remove the link elements.
        for child in removeSet:
            root.remove(child)

    def printIt(self, root, level = 0):
        print('    ' * level + root.tag, end = ' ')

        for (key, value) in root.items():
            print(key, '=', '"' + str(value) + '"', end = ' ')

        for child in root:
            self.printIt(child, level + 1)

    def run(self, root):
        self.moveUpLinkSets(root)
        self.gatherLinkSets(root)
        #self.printIt(root)
        self.resolveLinks(root, dict())
        self.removeLinkSets(root)