Source code for lucidity.template

# :coding: utf-8
# :copyright: Copyright (c) 2013 Martin Pengelly-Phillips
# :license: See LICENSE.txt.

import sys
import re
import copy
from collections import defaultdict

import lucidity.error

# Type of a RegexObject for isinstance check.
_RegexType = type(re.compile(''))


[docs]class Template(object):
    '''A template.'''

    _STRIP_EXPRESSION_REGEX = re.compile(r'{(.+?)(:(\\}|.)+?)}')
    _PLAIN_PLACEHOLDER_REGEX = re.compile(r'{(.+?)}')

    ANCHOR_START, ANCHOR_END, ANCHOR_BOTH = (1, 2, 3)

    RELAXED, STRICT = (1, 2)

[docs]    def __init__(self, name, pattern, anchor=ANCHOR_START,
                 default_placeholder_expression='[\w_.\-]+',
                 duplicate_placeholder_mode=RELAXED):
        '''Initialise with *name* and *pattern*.

        *anchor* determines how the pattern is anchored during a parse. A
        value of :attr:`~Template.ANCHOR_START` (the default) will match the
        pattern against the start of a path. :attr:`~Template.ANCHOR_END` will
        match against the end of a path. To anchor at both the start and end
        (a full path match) use :attr:`~Template.ANCHOR_BOTH`. Finally,
        ``None`` will try to match the pattern once anywhere in the path.

        *duplicate_placeholder_mode* determines how duplicate placeholders will
        be handled during parsing. :attr:`~Template.RELAXED` mode extracts the
        last matching value without checking the other values.
        :attr:`~Template.STRICT` mode ensures that all duplicate placeholders
        extract the same value and raises :exc:`~lucidity.error.ParseError` if
        they do not.

        '''
        super(Template, self).__init__()
        self.duplicate_placeholder_mode = duplicate_placeholder_mode
        self._default_placeholder_expression = default_placeholder_expression
        self._period_code = '_LPD_'
        self._name = name
        self._pattern = pattern
        self._anchor = anchor
        self._placeholderCount = defaultdict(int)
        self._regex = self._construct_regular_expression(self.pattern)
        self._format_specification = self._construct_format_specification(
            self.pattern
        )
        self._placeholders = self._extract_placeholders(
            self._format_specification
        )

    def __repr__(self):
        '''Return unambiguous representation of template.'''
        return '{0}(name={1!r}, pattern={2!r})'.format(
            self.__class__.__name__, self.name, self.pattern
        )

    def __deepcopy__(self, memo):
        '''Return deep copy of template.'''
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result

        for key, value in self.__dict__.items():
            if isinstance(value, _RegexType):
                # RegexObject is not deepcopyable, so store a new instance
                # compiled using the same pattern. Note that the compiled result
                #  will typically be the same instance.
                setattr(result, key, re.compile(value.pattern))
            else:
                setattr(result, key, copy.deepcopy(value, memo))

        return result

    @property
    def name(self):
        '''Return name of template.'''
        return self._name

    @property
    def pattern(self):
        '''Return template pattern.'''
        return self._pattern

[docs]    def parse(self, path):
        '''Return dictionary of data extracted from *path* using this template.

        Raise :py:class:`~lucidity.error.ParseError` if *path* is not
        parseable by this template.

        '''
        parsed = {}

        match = self._regex.search(path)
        if match:
            data = {}
            for key, value in sorted(match.groupdict().items()):
                # Strip number that was added to make group name unique.
                key = key[:-3]

                # If strict mode enabled for duplicate placeholders, ensure that
                # all duplicate placeholders extract the same value.
                if self.duplicate_placeholder_mode == self.STRICT:
                    if key in parsed:
                        if parsed[key] != value:
                            raise lucidity.error.ParseError(
                                'Different extracted values for placeholder '
                                '{0!r} detected. Values were {1!r} and {2!r}.'
                                .format(key, parsed[key], value)
                            )
                    else:
                        parsed[key] = value

                # Expand dot notation keys into nested dictionaries.
                target = data

                parts = key.split(self._period_code)
                for part in parts[:-1]:
                    target = target.setdefault(part, {})

                target[parts[-1]] = value

            return data

        else:
            raise lucidity.error.ParseError(
                'Path {0!r} did not match template pattern.'.format(path)
            )

[docs]    def format(self, data):
        '''Return a path formatted by applying *data* to this template.

        Raise :py:class:`~lucidity.error.FormatError` if *data* does not
        supply enough information to fill the template fields.

        '''
        def _format(match):
            '''Return value from data for *match*.'''
            placeholder = match.group(1)
            parts = placeholder.split('.')

            try:
                value = data
                for part in parts:
                    value = value[part]

            except (TypeError, KeyError):
                raise lucidity.error.FormatError(
                    'Could not format data {0!r} due to missing key {1!r}.'
                    .format(data, placeholder)
                )

            else:
                return value

        return self._PLAIN_PLACEHOLDER_REGEX.sub(
            _format, self._format_specification
        )

[docs]    def keys(self):
        '''Return unique set of placeholders in pattern.'''
        return self._placeholders.copy()

    def _extract_placeholders(self, pattern):
        '''Extract and return unique set of placeholders in *pattern*.'''
        return set(self._PLAIN_PLACEHOLDER_REGEX.findall(pattern))

    def _construct_format_specification(self, pattern):
        '''Return format specification from *pattern*.'''
        return self._STRIP_EXPRESSION_REGEX.sub('{\g<1>}', pattern)

    def _construct_regular_expression(self, pattern):
        '''Return a regular expression to represent *pattern*.'''
        # Escape non-placeholder components.
        expression = re.sub(
            r'(?P<placeholder>{(.+?)(:(\\}|.)+?)?})|(?P<other>.+?)',
            self._escape,
            pattern
        )

        # Replace placeholders with regex pattern.
        expression = re.sub(
            r'{(?P<placeholder>.+?)(:(?P<expression>(\\}|.)+?))?}',
            self._convert,
            expression
        )

        if self._anchor is not None:
            if bool(self._anchor & self.ANCHOR_START):
                expression = '^{0}'.format(expression)

            if bool(self._anchor & self.ANCHOR_END):
                expression = '{0}$'.format(expression)

        # Compile expression.
        try:
            compiled = re.compile(expression)
        except re.error as error:
            if 'bad group name' in error:
                raise ValueError('Placeholder name contains invalid '
                                 'characters.')
            else:
                _, value, traceback = sys.exc_info()
                message = 'Invalid pattern: {0}'.format(value)
                raise ValueError, message, traceback  #@IgnorePep8

        return compiled

    def _convert(self, match):
        '''Return a regular expression to represent *match*.'''
        placeholder_name = match.group('placeholder')

        # Support period (.) as nested key indicator. Currently, a period is
        # not a valid character for a group name in the standard Python regex
        # library. Rather than rewrite or monkey patch the library work around
        # the restriction with a unique identifier.
        placeholder_name = placeholder_name.replace('.', self._period_code)

        # The re module does not support duplicate group names. To support
        # duplicate placeholder names in templates add a unique count to the
        # regular expression group name and strip it later during parse.
        self._placeholderCount[placeholder_name] += 1
        placeholder_name += '{0:03d}'.format(
            self._placeholderCount[placeholder_name]
        )

        expression = match.group('expression')
        if expression is None:
            expression = self._default_placeholder_expression

        # Un-escape potentially escaped characters in expression.
        expression = expression.replace('\{', '{').replace('\}', '}')

        return r'(?P<{0}>{1})'.format(placeholder_name, expression)

    def _escape(self, match):
        '''Escape matched 'other' group value.'''
        groups = match.groupdict()
        if groups['other'] is not None:
            return re.escape(groups['other'])

        return groups['placeholder']