# FileSystem.py

# -*- coding: utf-8 -*-

# Description: File-system abstractions
# Documentation: algorithms.txt

from __future__ import print_function

import os.path
import string
import codecs
import shutil
import fnmatch
import re
import six

# Module-wide options object: stored by setGlobalOptions() and 
# returned by globalOptions(). It is None until options are set.
globalOptions_ = None
# Unique sentinel object used as the default value of the keySet
# parameter of markdownRegion(); it is compared by identity there
# to detect that no dictionary was passed.
emptyDict = object()

def remarkDirectory():
    '''
    Returns the directory containing the Remark package.

    returns (string):
    The directory-part of the real path (symbolic links 
    resolved) of this source file.
    '''
    thisFile = os.path.realpath(__file__)
    directory, _ = os.path.split(thisFile)
    return directory

# Path to the remark.py command-line script; empty until
# setRemarkScriptPath() is called. The name must match the one
# used by the two functions below (it was previously spelled
# 'remarkScriptpath_', which made remarkScriptPath() raise a 
# NameError when called before setRemarkScriptPath()).
remarkScriptPath_ = ''

def remarkScriptPath():
    '''
    Returns the path to the remark.py command-line script.
    This is non-empty only if Remark is used from the
    command-line, and not as a module.

    returns (string):
    The path last stored by setRemarkScriptPath(), or the
    empty string if no path has been stored.
    '''
    return remarkScriptPath_

def setRemarkScriptPath(path):
    '''
    Stores the path to the remark.py command-line script.

    path (string):
    The path to store. It is normalized by os.path.normpath()
    before being stored.
    '''
    global remarkScriptPath_
    remarkScriptPath_ = os.path.normpath(path)

def findMatchingFiles(inputDirectory, includeSet, excludeSet):
    '''
    Collects the files under a directory (recursively) whose 
    relative-name matches some inclusion-glob and matches no 
    exclusion-glob.

    inputDirectory (string):
    Path to the directory to traverse.

    includeSet (iterable of strings):
    The inclusion-globs.

    excludeSet (iterable of strings):
    The exclusion-globs.

    returns (list of strings):
    The unix-style names, relative to inputDirectory, of the
    matching files, in the order os.walk() visits them.
    '''

    # Turn both glob-sets into regex strings first, and
    # only then compile them.
    includeText = globToRegex(includeSet)
    excludeText = globToRegex(excludeSet)
    includePattern = re.compile(includeText)
    excludePattern = re.compile(excludeText)

    matchingNames = []
    for directoryPath, _, names in os.walk(inputDirectory):
        for name in names:
            absoluteName = os.path.normpath(
                os.path.join(directoryPath, name))
            candidate = unixRelativePath(inputDirectory, absoluteName)

            if includePattern.match(candidate) is None:
                # Not selected by any inclusion-glob.
                continue
            if excludePattern.match(candidate):
                # Rejected by an exclusion-glob.
                continue

            matchingNames.append(candidate)

    return matchingNames

def splitPath(p):
    '''
    Splits a pathname.

    This is a bug-fixed version of os.path.split() from
    Python 2.7.3. It used os.path.splitdrive() which does not correctly
    handle long-UNC paths. This is fixed by using os.path.splitunc()
    instead, whenever that function is available.

    p (string):
    The pathname to split. Both / and \\ are treated as 
    separators, independent of the platform.

    returns: 
    tuple (head, tail) where tail is everything after the final slash.
    Either part may be empty.
    '''

    # os.path.splitunc() exists only in the Windows flavor of
    # os.path, and it was removed in Python 3.7. Where it is 
    # missing, fall back to os.path.splitdrive() instead of 
    # failing with an AttributeError.
    splitPrefix = getattr(os.path, 'splitunc', os.path.splitdrive)
    prefix, rest = splitPrefix(p)

    separators = '/\\'

    # Find the index just beyond the last separator in 'rest'.
    cut = len(rest)
    while cut and rest[cut - 1] not in separators:
        cut -= 1

    # The tail contains no separators.
    head, tail = rest[:cut], rest[cut:]

    # Remove trailing separators from the head, unless 
    # the head consists of separators only.
    head = head.rstrip(separators) or head

    return prefix + head, tail

def setGlobalOptions(options):
    '''
    Sets the global-options object. The global options 
    can then be accessed by globalOptions().

    options:
    The object to store in the module-level globalOptions_ 
    variable. It is stored as it is, without copying.
    '''
    # Rebind the module-level variable rather than a local one.
    global globalOptions_
    globalOptions_ = options

def globalOptions():
    '''
    Returns the current value of the module-level 
    globalOptions_ variable, i.e. the global-options 
    object assigned by setGlobalOptions().
    '''
    return globalOptions_

def htmlRegion(htmlText):
    '''
    Encloses html-text in a Markdown region with the 
    attributes class="html" and remark-content="text".

    htmlText:
    The html-text to enclose; passed to markdownRegion() 
    as it is.

    returns:
    The result of markdownRegion().
    '''
    # The html is wrapped into a region so that it does 
    # not get wrapped into a <p> element.
    attributeSet = {
        'class' : 'html', 
        'remark-content' : 'text'
    }
    return markdownRegion(htmlText, attributeSet)

def markdownRegion(enclosedText, keySet = emptyDict, elementName = 'div'):
    '''
    Wraps text into a Markdown region (Remark extension):

    !!! <elementName aKey = "aValue" bKey = "bValue" ...>
        enclosedText

    enclosedText (list of strings):
    The lines of text to enclose. Each line is indented
    by a tab-character.

    keySet (string:string):
    The key-value pairs to write as attributes of the region.
    Keys are stripped of surrounding whitespace; values are
    written as they are. Defaults to no attributes.

    elementName (string):
    The name of the region element.

    returns (list of strings):
    The region-header line, followed by the indented lines,
    followed by an empty line.
    '''
    if keySet is emptyDict:
        keySet = {}

    attributeText = ''.join(
        ' ' + key.strip() + ' = "' + value + '"'
        for (key, value) in keySet.items())

    region = ['!!! <' + elementName + attributeText + '>']
    region.extend('\t' + line for line in enclosedText)

    # The terminating empty line is needed when the region
    # is produced inline, as in [[Verbatim: like this]];
    # without it the content that follows would be taken 
    # to be part of the region. In other cases the empty
    # line is superfluous, and merely makes the Markdown
    # source output a bit uglier.
    region.append('')

    return region

def globToRegex(glob):
    '''
    Converts a glob, or a set of globs, to a regular expression.

    A string is stripped of surrounding whitespace and 
    converted by fnmatch.translate(). Anything else is treated 
    as an iterable of globs: each element is converted 
    recursively, and the results are combined as alternatives 
    by combineRegex(). An empty iterable produces a regular 
    expression which matches nothing.

    glob (string or an iterable of strings):
    The glob, or the set of globs, to convert.

    returns (string):
    The regular expression.
    '''
    if isinstance(glob, six.string_types):
        return fnmatch.translate(glob.strip())

    alternativeSet = [globToRegex(item) for item in glob]
    if not alternativeSet:
        # An empty negative lookahead never matches.
        alternativeSet.append(r'(?!)')

    return combineRegex(alternativeSet)

def combineRegex(regex):
    '''
    Combines a regular expression or a set of regular expression 
    as alternatives into a single regular expression.

    If the regular expression is a string, it is returned 
    stripped of surrounding whitespace. Otherwise each regular 
    expression is grouped into a non-capturing parenthesis and 
    these groups are combined as alternatives. A set with a 
    single regular expression yields that expression without 
    grouping, and an empty set yields the empty string.

    regex (string or an iterable of strings):
    The regular expression or a set of regular expression to 
    combine. Whitespace will be removed from both sides of
    each regular expression.

    returns (string):
    The combined regular expression.
    '''
    if isinstance(regex, six.string_types):
        return regex.strip()

    regexSet = [line.strip() for line in regex]
    if not regexSet:
        return ''

    if len(regexSet) == 1:
        # Use the regex as it is.
        return regexSet[0]

    # Every alternative gets its own non-capturing group.
    # Joining with ')|(' instead would make every alternative 
    # after the first a capturing group.
    return '|'.join('(?:' + item + ')' for item in regexSet)

def escapeMarkdown(text):
    r'''
    Escapes the * and _ Markdown meta-characters by \* and \_.

    text (string):
    The text to escape.

    returns (string):
    The text with * and _ replaced with \* and \_, respectively.
    '''
    # Note: the docstring is a raw string, because \* and \_ 
    # are not valid escape sequences in an ordinary string.
    escapeSet = ('*', '_')
    return ''.join(
        '\\' + c if c in escapeSet else c
        for c in text)

def pathSuffixSet(relativePath):
    '''
    Returns the set of path-suffixes of a given relative-path.

    The path is first normalized to unix-style by 
    unixDirectoryName(). A path-suffix is then the part of the 
    normalized path which follows one of its /-separators. 
    For example, for 'a/b/c' the path-suffixes are 'c' and 'b/c'.
    The normalized path itself is not included.

    relativePath (string):
    The relative-path to compute the path-suffixes for.

    returns (list of strings):
    The set of path-suffixes for the relative-path, from
    the shortest to the longest.
    '''
    # The separators must be searched from the normalized path, 
    # and not from the input path: the two may differ both in 
    # length and in the separator characters they use.
    path = unixDirectoryName(relativePath)

    suffixSet = []
    for index in range(len(path) - 1, -1, -1):
        if path[index] == '/':
            suffixSet.append(path[index + 1 : ])
    return suffixSet

def unixRelativePath(fromRelativeDirectory, toRelativePath):
    '''
    Computes the unix-style relative-path which leads from
    a directory to a path.

    fromRelativeDirectory:
        The relative directory in which the link resides.

    toRelativePath:
        A relative path to link to.

    returns:
        The result of os.path.relpath(), normalized to 
        unix-style by unixDirectoryName().
    '''
    return unixDirectoryName(
        os.path.relpath(toRelativePath, fromRelativeDirectory))

def openFileUtf8(fileName):
    '''
    Opens a file for reading as utf-8 decoded.
    The file is decoded with the 'utf-8-sig' codec.
    Decoding errors are treated by 'replace'.

    fileName (string):
    The file to open. The name is passed through longPath().

    returns:
    A handle to the opened file, as returned by codecs.open().
    '''
    # The mode must not contain the 'U' flag: the flag was 
    # removed in Python 3.11, where open() rejects it with 
    # a ValueError. 
    return codecs.open(longPath(fileName),
                       mode = 'r', encoding = 'utf-8-sig', 
                       errors = 'replace')

def fileSize(fileName):
    '''
    Returns the size of a file, as reported by 
    os.path.getsize() for longPath(fileName).

    fileName (string):
    The file whose size to return.
    '''
    fullName = longPath(fileName)
    return os.path.getsize(fullName)

def readFile(fileName, maxSize = -1):
    '''
    Opens a file using openFileUtf8, and reads the contents 
    into a list of strings corresponding to the rows of the file.
    The carriage-returns and newlines are stripped off from 
    the end of each line. 

    fileName (string):
    The file to read.

    maxSize (integer):
    Size-limit for the file in bytes; a file whose size is 
    at least maxSize is skipped with a warning. Use a negative 
    number for unbounded size.
    
    returns (list of strings):
    The rows of the file, if the file is not skipped. Otherwise
    the empty list []. The empty list is also returned, with 
    a warning, if opening or reading the file fails.
    '''
    size = fileSize(fileName)
    if maxSize >= 0 and size >= maxSize:
        # If the file is very large, then it probably is not
        # part of the Remark documentation. Refuse to read
        # such files.
        print()
        print('Warning:', fileName, end = ' ')
        print('is larger than', maxSize, 'bytes (it is', size, 'bytes).', end = ' ')
        print('Ignoring it.')
        return []

    # Read the file into memory. Reading is best-effort: any
    # ordinary error results in a warning and an empty result.
    # KeyboardInterrupt and SystemExit are not caught, so that 
    # the user can still abort the program.
    try:
        with openFileUtf8(fileName) as inputFile:
            text = inputFile.readlines()
    except Exception:
        print()
        print('Warning:', fileName, end = ' ')
        print('could not be read for some reason.', end = ' ')
        print('Ignoring it.')
        return []

    # Remove possible newlines from the ends of the lines.
    # The lines are encoded by the list-structure instead.
    # Note that this should have been done by 
    # file.readlines(keepends = False). However, it is a bug 
    # in Python 2.7.3 that the keepends argument is missing. 
    return [line.rstrip('\r\n') for line in text]

def writeFile(text, outputFullName):
    '''
    Writes text into a file using utf-8 encoding.
    Each string is written as a line of its own, 
    terminated by a new-line character.

    text (a list of strings):
    The text to write into the file.

    outputFullName (string):
    The name of the file to write to. The directories in 
    the name are created if they do not exist.
    '''
    # If the directories do not exist, create them.
    # The directory-part is empty when the name consists of 
    # a file-name only; then there is nothing to create, and 
    # os.makedirs() would fail for an empty path.
    outputDirectory = os.path.split(outputFullName)[0]
    if outputDirectory and not pathExists(outputDirectory):
        createDirectories(outputDirectory)

    # Save the text to a file.
    with codecs.open(longPath(outputFullName), mode = 'w', encoding = 'utf-8') as outputFile:
        # Note that we can't use outputFile.writelines() because it would
        # concatenate the lines without the new-line at the end.
        for line in text:
            outputFile.write(line)
            outputFile.write('\n')

def createDirectories(name):
    '''
    Creates a directory, together with its missing parent 
    directories, by os.makedirs(longPath(name)).

    name (string):
    The directory to create.
    '''
    target = longPath(name)
    os.makedirs(target)

def pathExists(path):
    '''
    Returns whether a path exists, as reported by
    os.path.exists() for longPath(path).

    path (string):
    The path to test.
    '''
    fullPath = longPath(path)
    return os.path.exists(fullPath)

def directoryExists(path):
    '''
    Returns whether a path exists and is a directory, i.e.
    pathExists(path) and os.path.isdir(longPath(path)).

    path (string):
    The path to test.
    '''
    if not pathExists(path):
        return False
    return os.path.isdir(longPath(path))

def copyTree(inputDirectory, outputDirectory):
    '''
    Copies a directory-tree by shutil.copytree(), with
    both directories passed through longPath().

    inputDirectory (string):
    The directory to copy from.

    outputDirectory (string):
    The directory to copy to.
    '''
    source = longPath(inputDirectory)
    target = longPath(outputDirectory)
    shutil.copytree(source, target)

def copyFile(inputRelativeName, inputDirectory, 
             outputRelativeName, outputDirectory):
    '''
    Copies an input-file to the output-file. Creates
    the needed output directories if they do not exist.
    The copying is done by shutil.copy(); an error raised 
    by it, such as for a missing input-file, is passed on 
    to the caller.

    inputRelativeName (string):
    The name of the input-file, relative to the input-directory.

    inputDirectory (string):
    The input-directory.

    outputRelativeName (string):
    The name of the output-file, relative to the output-directory.

    outputDirectory (string):
    The output-directory.
    '''
    inputFilePath = os.path.join(inputDirectory, inputRelativeName)
    outputFilePath = os.path.join(outputDirectory, outputRelativeName)

    # If the output directory does not exist, create it.
    # The directory-part is empty when the output path consists
    # of a file-name only; then there is nothing to create, and 
    # os.makedirs() would fail for an empty path.
    finalOutputDirectory = os.path.split(outputFilePath)[0]
    if finalOutputDirectory and not pathExists(finalOutputDirectory):
        createDirectories(finalOutputDirectory)

    # Copy the file.
    shutil.copy(longPath(inputFilePath), longPath(outputFilePath))

def longPath(path):
    r'''
    Returns the input path with a possible long-UNC-prefix \\?\.

    path (string):
    The path to possibly prefix.
    
    returns:
    The input path prefixed with \\?\, if under Windows,
    the path is longer than 259 characters, and the path does
    not already start with the prefix. Otherwise the input
    path unchanged.
    '''
    # Note: the docstring is a raw string, because it contains
    # backslashes which do not form valid escape sequences.

    # Windows has a limit of 260 characters for the length of 
    # standard paths, including the null-character at the end.
    # Longer paths need to use a special long-UNC notation which is 
    # denoted by a prefix \\?\. 

    # You would hope that this fixed the long-path problems with 
    # Windows, but this unfortunately is not the case. For example, 
    # the CreateDirectoryW function of the Windows API does not 
    # support long-paths even by the \\?\-prefix, contrary to its 
    # documentation. So why we bother here? The reason is that oddly 
    # this trick works to support slightly longer paths with network-
    # mapped drives, which use some kind of path-compression to 
    # artificially produce longer path-names (but the compressed path 
    # also needs to be <= 259 characters). In particular, this 
    # situation comes by when the mapped-drive is a Linux file 
    # system, and contains paths longer than 259 characters.

    # It is essential that the \\?\ is only prefixed
    # for paths longer than 259 characters. The reason is related to
    # Python's handling of directories, e.g. in listdir implemented
    # in posixmodule.c, which uses a / as a separator to append a
    # /*.* path-suffix. The \\?\ has the effect, in Windows API, of
    # disabling any string-processing for the path, including the
    # replacement of / with \. Therefore using the \\?\-prefix for
    # shorter paths leads to invalid path-names. But why then
    # is the \\?\-prefix ok with paths longer than 259 characters?
    # I have no idea, but in our tests, the / suddenly becomes 
    # acceptable as a separator after this breakpoint.

    # To be consistent with Python, you might be thinking of using
    # the //?/-prefix instead. This has the effect of not disabling
    # the string-processing in Windows API, and the result is that
    # / are converted to \, leading to the \\?\-prefix, and also
    # avoiding the invalid path-names. However, for some unimaginable 
    # reasons, this is not equivalent to the \\?\-prefix, and does 
    # not work to support those long path-names in network-mapped 
    # drives.

    longPrefix = '\\\\?\\'

    # A path which already carries the prefix is left as it is,
    # so that applying this function twice does not produce
    # a doubly-prefixed path.
    if (os.name == 'nt' and 
        len(path) > 259 and 
        not path.startswith(longPrefix)):
        return longPrefix + path

    return path

def copyIfNecessary(inputRelativeName, inputDirectory, 
                    outputRelativeName, outputDirectory):
    '''
    Copies an input-file to an output-file by copyFile(), 
    but only when fileUpToDate() reports that the output-file 
    is not up-to-date.

    inputRelativeName (string):
    The name of the input-file, relative to the input-directory.

    inputDirectory (string):
    The input-directory.

    outputRelativeName (string):
    The name of the output-file, relative to the output-directory.

    outputDirectory (string):
    The output-directory.

    returns:
    A boolean stating if the file was actually copied.
    '''
    if fileUpToDate(inputRelativeName, inputDirectory, 
                    outputRelativeName, outputDirectory):
        # Nothing to do.
        return False

    copyFile(inputRelativeName, inputDirectory,
             outputRelativeName, outputDirectory)
    return True

def listDirectory(path):
    '''
    Returns the names of the entries in a directory, as
    reported by os.listdir() for longPath(path).

    path (string):
    The directory to list.
    '''
    fullPath = longPath(path)
    return os.listdir(fullPath)

def fileModificationTime(filePath):
    '''
    Returns the modification time of a file, as reported
    by os.path.getmtime() for longPath(filePath).

    filePath (string):
    The file whose modification time to return.
    '''
    fullPath = longPath(filePath)
    return os.path.getmtime(fullPath)

def fileExists(inputRelativeName, inputDirectory):
    '''
    Returns whether a path exists and is a file.

    inputRelativeName (string):
    The name of the file, relative to the input-directory.

    inputDirectory (string):
    The input-directory.

    returns:
    pathExists(path) and os.path.isfile(longPath(path)), where
    path is os.path.join(inputDirectory, inputRelativeName).
    '''
    fullName = os.path.join(inputDirectory, inputRelativeName)
    if not pathExists(fullName):
        return False
    return os.path.isfile(longPath(fullName))

def fileUpToDate(inputRelativeName, inputDirectory,
                 outputRelativeName, outputDirectory):
    '''
    Returns whether the output-file is up-to-date with
    respect to the input-file.

    inputRelativeName (string):
    The name of the input-file, relative to the input-directory.

    inputDirectory (string):
    The input-directory.

    outputRelativeName (string):
    The name of the output-file, relative to the output-directory.

    outputDirectory (string):
    The output-directory.

    returns:
    True, if the output-file exists and its modification 
    time-stamp is not earlier than that of the input-file.
    False otherwise; in particular when the input-file does 
    not exist, in which case an error message is also printed.
    '''
    inputFilePath = os.path.join(inputDirectory, inputRelativeName)
    outputFilePath = os.path.join(outputDirectory, outputRelativeName)

    if not pathExists(inputFilePath):
        message = 'Error: fileUpToDate: the input file ' + inputRelativeName + ' does not exist.'
        print(message)
        return False

    if not pathExists(outputFilePath):
        # A missing output-file is never up-to-date.
        return False

    inputTime = fileModificationTime(inputFilePath)
    outputTime = fileModificationTime(outputFilePath)
    return inputTime <= outputTime

def unixDirectoryName(name):
    '''
    Converts a directory-name (unix-style or not) to a 
    normalized unix-style directory-name.

    name (string):
    The directory-name to convert.

    returns (string):
    The name normalized by os.path.normpath(), with each 
    backslash replaced by a forward slash.
    '''
    normalized = os.path.normpath(name)
    return '/'.join(normalized.split('\\'))

def fileExtension(fileName):
    '''
    Returns the filename-extension of the filename, as
    given by os.path.splitext(). The extension includes 
    the leading dot, and is empty if there is no extension.

    fileName (string):
    The filename to extract the extension from.
    '''
    _, extension = os.path.splitext(fileName)
    return extension

def withoutFileExtension(fileName):
    '''
    Returns the filename with the filename-extension,
    as given by os.path.splitext(), removed.

    fileName (string):
    The filename to remove the extension from.
    '''
    stem, _ = os.path.splitext(fileName)
    return stem

def changeExtension(fileName, newExtension):
    '''
    Replaces the filename-extension of a filename 
    with another.

    fileName (string):
    The filename whose extension to replace.

    newExtension (string):
    The text to append after the filename without its 
    extension; it is appended as it is, so a leading dot 
    must be included if one is desired.
    '''
    stem = withoutFileExtension(fileName)
    return stem + newExtension