Dictionary_TagParser.py

# -*- coding: utf-8 -*-

# Description: Dictionary tag parser
# Detail: Searches for tags based on tag-key : tag-text patterns.
# Documentation: tag_parsers.txt

from Remark.FileSystem import openFileUtf8

import re
import string

class Dictionary_TagParser(object):
    def __init__(self, tagMap):
        '''
        Searches for tag-definitions of the form tag-key : tag-text
        where tag-key and tag-text are both strings and the tag-text 
        reaches to the end of the line. There can be optional 
        whitespace surrounding the colon.

        tagMap (string --> string):
        A map from a tag-name to a tag-key. The map has to
        invertible; there is a unique tag-name for each tag-key.        
        For example: {'description' : 'Description', ...}
        '''

        # Construct a regex of the form
        # (tag1|tag2|tag3|...) : text
        # the parentheses capture the tag-key,
        # while text captures the tag-text.
        tags = 0

        # This is meant to cover for the possible comment mark.
        # By excluding preceding numbers and letters we cut off
        # the possibility of it being something else than a
        # tag-key : tag-text pattern.
        regex = r'^[^a-zA-Z0-9:]*'
        #regex = r'^.*'

        # The tag-key (captured).
        regex += r'('
        for tagKey in tagMap.values():
            # The tag-name may contain characters that
            # are meta-characters in a regular expression.
            # Therefore we need to escape the tag-names.
            regex += re.escape(tagKey.strip())
            if tags < len(tagMap) - 1:
                regex += r'|'
            tags +=1
        regex += r')'

        # The :, but not ::, surrounded by whitespace.
        # The :: is common in C++. For example, I had
        # a source-code line which began with 'Detail::'.
        regex += r'[ \t]*:(?!:)[ \t]*'

        # The tag-text (captured). It extends to the end of the line.
        regex += r'(.*)'

        # Compile the regex to a regex-object.
        self.tagRegex = re.compile(regex)

        # Construct a mapping from keys to tag-names.
        # This is needed because the regex searches
        # for the tag-keys.
        self.tagKeyMap = {}
        for tagName, tagKey in tagMap.items():
            self.tagKeyMap[tagKey] = tagName

    def parse(self, fileName, maxLines, reporter):
        tagSet = {}
        with openFileUtf8(fileName) as file:
            lineNumber = 0
            for fileLine in file:
                match = self.tagRegex.match(fileLine)
                if match != None:
                    # The first group is the tag-name.
                    tagKey = match.group(1)
                    # The second group is the tag-text.
                    tagText = match.group(2).strip()

                    # Find out the corresponding tag-name.
                    tagName = self.tagKeyMap.get(tagKey)
                    assert tagName != None

                    # See if the tag has already been defined.
                    if tagSet.get(tagName, [''])[0] != '':
                        # The tag has already been defined.
                        # Ignore the later definition.
                        reporter.reportWarning(["Multiple definitions for the tag '" + tagName + "'.",
                                                'Current: ' + tagSet[tagName][0],
                                                'New: ' + tagText],
                                               'ambiguous-input')
                    else:
                        tagSet[tagName] = [tagText]

                lineNumber += 1
                if lineNumber >= maxLines:
                    # All tags must occur within 'maxLines' lines.
                    break

        return tagSet