Source code for are.are

"""
Library for defining and operating on abstract regular expressions that work
with any symbol type, with an emphasis on supporting scenarios in which it is
necessary to work with regular expressions as abstract mathematical objects.
"""
from __future__ import annotations
import doctest
from typing import Optional, Iterable
import collections.abc
from reiter import reiter
from nfa import nfa, epsilon

[docs]class are(tuple): """ Base class for abstract regular expression instances (and the individual nodes found within an abstract syntax tree instance). Abstract regular expressions can contain symbols of any immutable type and can be built up using common operators such as concatenation, alternation, and repetition. >>> a = con(lit(1), con(lit(2), lit(3))) >>> a([1, 2, 3]) 3 This class is derived from the built-in :class:`tuple <tuple>` type. Each instance of this class acts as a node within the abstract syntax tree representing an abstract regular expression. The elements inside the instance are the child nodes of the node represented by the instance and can be accessed in the usual manner supported by the :class:`tuple <tuple>` type. >>> a = con(lit(1), con(lit(2), lit(3))) >>> a[1] con(lit(2), lit(3)) """
[docs] def to_nfa(self: are, _nfa: nfa = None) -> nfa: """ Convert this abstract regular expression instance into a nondeterministic finite automaton (NFA) that accepts the set of iterables that satisfies this instance. >>> a = con(lit(1), con(lit(2), lit(3))) >>> a.to_nfa() nfa({1: nfa({2: nfa({3: nfa()})})}) """ _nfa_ = None _nfa_next = nfa() if _nfa is None else _nfa if isinstance(self, nul): _nfa_ = None elif isinstance(self, emp): _nfa_ = _nfa_next elif isinstance(self, lit): _nfa_ = nfa({self[0]: _nfa_next}) elif isinstance(self, con): _nfa_rhs = self[1].to_nfa(_nfa_next) _nfa_ = None if _nfa_rhs is None else self[0].to_nfa(_nfa_rhs) elif isinstance(self, alt): _nfa_lhs = self[0].to_nfa(_nfa_next) _nfa_rhs = self[1].to_nfa(_nfa_next) if _nfa_lhs is not None and _nfa_rhs is not None: _nfa_ = nfa({epsilon: [_nfa_lhs, _nfa_rhs]}) elif _nfa_lhs is not None: _nfa_ = _nfa_lhs elif _nfa_rhs is not None: _nfa_ = _nfa_rhs else: _nfa_ = None elif isinstance(self, rep): _nfa_ = nfa({epsilon: [_nfa_next]}) _nfa_arg = self[0].to_nfa(_nfa_) if _nfa_arg is not None: _nfa_[epsilon].append(_nfa_arg) # In the root invocation, ``None`` implies that the NFA instance should # reject all strings. Otherwise, return the assembled NFA instance. return -nfa() if (_nfa_ is None and _nfa is None) else _nfa_
[docs] def compile(self: are) -> are: """ Convert this instance into an equivalent NFA and store it internally as an attribute (to enable more efficient matching). Return the original abstract regular expression instance. >>> a = alt(lit('x'), rep(con(lit('y'), lit('z')))) >>> a = a.compile() >>> a(['x']) 1 >>> a(['y', 'z', 'y', 'z']) 4 """ setattr(self, '_compiled', self.to_nfa().compile()) # Transition table. return self
[docs] def to_re(self: are) -> str: """ If this instance has string symbols (and no other symbols of any other type), convert it to an equivalent regular expression string that is compatible with the built-in :obj:`re <re>` module. >>> rep(alt(con(lit('a'), lit('b')), emp())).to_re() '((((a)(b))|)*)' >>> rep(alt(con(lit('a'), con(lit('b'), nul())), emp())).to_re() '((((a)((b)[^\\\\w\\\\W]))|)*)' Any attempt to convert an instance that has non-string symbols raises an exception. >>> rep(alt(con(lit(123), lit(456)), emp())).to_re() Traceback (most recent call last): ... TypeError: all symbols must be strings """ if isinstance(self, nul): re_ = r'[^\w\W]' # Contradiction. if isinstance(self, emp): re_ = '' elif isinstance(self, lit): if not isinstance(self[0], str): raise TypeError('all symbols must be strings') re_ = '(' + self[0] + ')' elif isinstance(self, con): re_ = '(' + self[0].to_re() + self[1].to_re() + ')' elif isinstance(self, alt): re_ = '(' + self[0].to_re() + '|' + self[1].to_re() + ')' elif isinstance(self, rep): re_ = '(' + self[0].to_re() + '*)' return re_
[docs] def __call__(self: are, string: Iterable, full: bool = True, _index: int = 0) -> Optional[int]: """ Determine whether an iterable of symbols (*i.e.*, an abstract *string* in the formal sense associated with the mathematical definition of a regular expression) is in the formal language represented by this instance. By default, the length of the abstract string is returned if the abstract string satisfies this instance. >>> a = rep(con(lit(1), lit(2))) >>> a([1, 2, 1, 2, 1, 2]) 6 >>> a = alt(rep(lit(2)), rep(lit(3))) >>> a([2, 2, 2, 2, 2]) 5 >>> a([3, 3, 3, 3]) 4 If the supplied abstract string does not satisfy this instance, then ``None`` is returned. >>> a([1, 1, 1]) is None True If the optional parameter ``full`` is set to ``False``, then the length of the longest prefix of the abstract string that satisfies this instance is returned. If no prefix satisfies this instance, then ``None`` is returned. >>> a = con(lit(1), con(lit(2), lit(3))) >>> a([1, 2, 3, 4, 5], full=False) 3 >>> a = con(lit(1), con(lit(2), lit(3))) >>> a([4, 4, 4], full=False) is None True If an instance is satisfied by the empty abstract string and ``full`` is set to ``False``, then the empty prefix of any abstract string satisfies the abstract regular expression instance (and, thus, a successful integer result of ``0`` is returned in such cases). >>> a = alt(lit(2), emp()) # Satisfied by the empty abstract string. >>> a([1, 1, 1], full=False) # Empty string is a prefix of ``[1, 1, 1]``. 0 Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> nul()(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ if not isinstance(string, (collections.abc.Iterable, reiter)): raise ValueError('input must be an iterable') string = reiter(string) # Determine the length of the longest match either using the compiled # NFA (if it is present) or the instance itself. return ( # pylint: disable=no-member self._compiled(string, full=full) if hasattr(self, '_compiled') and self._compiled is not None else self._match(string, full, _index) )
[docs] def __str__(self: are) -> str: """ Return string representation of instance. >>> a = rep(con(lit(1), alt(lit(2), lit(3)))) >>> str(a) 'rep(con(lit(1), alt(lit(2), lit(3))))' Assuming that this module has been imported in a manner such that the :obj:`are` subclasses are associated with the variables as they appear in this module (*e.g.*, :obj:`con` is associated with the variable ``con`` in the relevant scope), the strings returned by this method can be evaluated to reconstruct the instance. >>> a = rep(con(lit(1), alt(lit(2), lit(3)))) >>> eval(str(a)) rep(con(lit(1), alt(lit(2), lit(3)))) """ return \ type(self).__name__ + \ (str(tuple(self)) if len(self) != 1 else str(tuple(self))[:-2] + ')')
[docs] def __repr__(self: are) -> str: """ Return string representation of instance. >>> rep(alt(con(lit('a'), lit('b')), emp())) rep(alt(con(lit('a'), lit('b')), emp())) """ return str(self)
[docs]class nul(are): """ Singleton class containing an object that corresponds to the sole abstract regular expression instance that cannot be satisfied by any iterable (*i.e.*, that cannot be satisfied by any abstract string). >>> (nul()(iter('ab')), nul()(iter('abc'), full=False)) (None, None) >>> r = nul() >>> (r(''), r('abc'), r('', full=False), r('abc', full=False)) (None, None, None, None) More usage examples involving compilation of :obj:`are` instances that contain instances of this class are presented below. >>> r = r.compile() >>> (r(''), r('abc'), r('', full=False), r('abc', full=False)) (None, None, None, None) >>> ((con(nul(), lit('a')))('a'), (con(nul(), lit('a'))).compile()('a')) (None, None) >>> ((con(lit('a'), nul()))('a'), (con(lit('a'), nul())).compile()('a')) (None, None) >>> ((alt(nul(), lit('a')))('a'), (alt(nul(), lit('a'))).compile()('a')) (1, 1) >>> ((alt(lit('a'), nul()))('a'), (alt(lit('a'), nul())).compile()('a')) (1, 1) >>> ((alt(nul(), nul()))('a'), (alt(nul(), nul())).compile()('a')) (None, None) >>> (con(rep(nul()), lit('a')).compile())('a') 1 Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> nul()(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls): """Instance constructor.""" return super().__new__(cls) # pylint: disable=unused-argument def _match(self: are, string: Iterable, full: bool, _index: int): """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ try: _ = string[_index] # May consume a symbol. return None except (StopIteration, IndexError): return None
[docs]class emp(are): """ Singleton class containing an object that corresponds to the sole abstract regular expression instance that is satisfied only by an empty iterable (*i.e.*, an abstract string with a length of zero). >>> (emp()(''), emp()('ab')) (0, None) >>> emp()(iter('ab')) is None True >>> emp()('abc', full=False) 0 >>> emp()(iter('abc'), full=False) 0 >>> r = emp().compile() >>> (r(''), r('abc')) (0, None) Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> emp()(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls): """Instance constructor.""" return super().__new__(cls) def _match(self: are, string: Iterable, full: bool, _index: int) -> Optional[int]: """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ try: _ = string[_index] # May consume a symbol. if not full: return 0 return None except (StopIteration, IndexError): return 0
[docs]class lit(are): """ Abstract regular expression instances that are satisfied by exactly one symbol. Instances of this class also serve as the leaf nodes (*i.e.*, base cases) corresponding to abstract string *literals* (in the formal sense associated with the mathematical definition of a regular expression). >>> (lit('a')(''), lit('a')('a'), lit('a')('ab')) (None, 1, None) >>> (lit('a')('', full=False), lit('a')('ab', full=False)) (None, 1) >>> lit('a')(iter('ab'), full=False) 1 >>> r = lit('a').compile() >>> (r('a'), r('')) (1, None) Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> lit('a')(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls, argument): """Instance constructor.""" return super().__new__(cls, [argument]) def _match(self: are, string: Iterable, full: bool, _index: int) -> Optional[int]: """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ try: symbol = string[_index] if symbol == self[0]: if not full: return 1 # pylint: disable=protected-access result = emp()._match(string, full=True, _index=(_index + 1)) if result == 0: return 1 return None # Abstract string does not satisfy the regular expression. except (StopIteration, IndexError): return None
[docs]class con(are): """ Concatenation operation for two :obj:`are` instances. Instances of this class also serve as the internal nodes of the tree data structure representing an abstract regular expression. >>> r = con(lit('a'), lit('b')) >>> (r('ab'), r('a'), r('abc'), r('cd')) (2, None, None, None) >>> (r(iter('ab')), r(iter('a')), r(iter('abc')), r(iter('cd'))) (2, None, None, None) >>> (r('a', full=False), r('abc', full=False), r('cd', full=False)) (None, 2, None) >>> (r(iter('a'), full=False), r(iter('abc'), full=False), r(iter('cd'), full=False)) (None, 2, None) >>> r = con(lit('a'), con(lit('b'), lit('c'))) >>> (r('abc'), r('abcd', full=False), r('ab')) (3, 3, None) >>> (r(iter('abc')), r(iter('abcd'), full=False), r(iter('ab'))) (3, 3, None) >>> r = con(con(lit('a'), lit('b')), lit('c')) >>> r('abc') 3 >>> r(iter('abc')) 3 >>> r = con(lit('a'), lit('b')).compile() >>> (r('ab'), r('a'), r('abc'), r('cd')) (2, None, None, None) Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> r(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls, *arguments): """Instance constructor.""" return super().__new__(cls, [*arguments]) def _match(self: are, string: Iterable, full: bool, _index: int) -> Optional[int]: """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ # pylint: disable=protected-access lengths = self[0]._match(string, full=False, _index=_index) if lengths is not None: length = self[1]._match(string, full=False, _index=(_index + lengths)) if length is not None: lengths += length if not full: return lengths result = emp()._match(string, full=True, _index=(_index + lengths)) if result == 0: return lengths return None # Abstract string does not satisfy the regular expression.
[docs]class alt(are): """ Alternation operation for two :obj:`are` instances. Instances of this class also serve as the internal nodes of the tree data structure representing an abstract regular expression. >>> r = alt(con(lit('a'), lit('a')), lit('a')) >>> r('aa') 2 >>> r = alt(lit('b'), con(lit('a'), lit('a'))) >>> r('aa') 2 >>> r = con(alt(lit('a'), lit('b')), alt(lit('c'), lit('d'))) >>> (r('ac'), r('ad'), r('bc'), r('bd')) (2, 2, 2, 2) >>> r = con(alt(lit('a'), lit('b')), lit('c')) >>> (r('ac'), r('bc'), r('c'), r('a'), r('b')) (2, 2, None, None, None) >>> r = alt(con(lit('a'), lit('b')), lit('a')) >>> r('abc', full=False) 2 >>> r = alt(lit('a'), con(lit('a'), lit('a'))) >>> r('aaa', full=False) 2 >>> r = alt(lit('a'), con(lit('a'), lit('a'))) >>> r('aaa') is None True >>> r = alt(con(lit('a'), lit('a')), lit('a')) >>> r('aa', full=False) 2 >>> r = alt(lit('a'), lit('a')) >>> r('a') 1 >>> r = alt(lit('a'), lit('b')) >>> r('ac') is None True >>> (r('a'), r('b'), r('c')) (1, 1, None) >>> r('ac', full=False) 1 >>> r0 = alt(lit('a'), alt(lit('b'), lit('c'))) >>> r1 = con(r0, r0) >>> {r1(x + y) for x in 'abc' for y in 'abc'} {2} >>> r = alt(lit('b'), con(lit('c'), lit('a'))) >>> r('aab') is None True >>> r(iter('aab')) is None True >>> r = alt(con(lit('a'), lit('a')), con(lit('a'), con(lit('a'), lit('a')))) >>> (r('aaa'), r('aa')) (3, 2) >>> r = alt(con(lit('a'), lit('a')), con(lit('a'), con(lit('a'), lit('a')))) >>> (r('aaa', full=False), r('aa', full=False)) (3, 2) >>> (r(iter('aaa'), full=False), r(iter('aa'), full=False)) (3, 2) >>> r = alt(con(lit('a'), lit('a')), con(lit('a'), con(lit('a'), lit('a')))) >>> r = r.compile() >>> (r('aaa'), r('aa'), r('a')) (3, 2, None) Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> r(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls, *arguments): """Instance constructor.""" return super().__new__(cls, [*arguments]) def _match(self: are, string: Iterable, full: bool, _index: int) -> Optional[int]: """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ # pylint: disable=protected-access # Determine whether (and by what length of substring) the first alternative # is satisfied. lengths = [self[0]._match(string, full=full, _index=_index)] if lengths[0] is None: # First alternative is not satisfied, so determine separately whether # (and by what length of substring) the second alternative is satisfied. return self[1]._match(string, full=full, _index=_index) # First alternative is satisfied; determine if second alternative is also # satisfied (for example, in case it is satisfied by a longer portion of the # abstract string). lengths.append(self[1]._match(string, full=full, _index=_index)) if lengths[1] is None: # Second alternative is not satisfied, so return the length of the # satisfying substring (*unless* it has been specified that the full # string must satisfy this instance). length = lengths[0] if not full: return length # Ensure that the full string satisfies this instance; otherwise, # indicate failure. result = emp()._match(string, full=True, _index=(_index + length)) return length if result == 0 else None # Both alternatives succeeded, so return the length of the longer of the # two satisfied alternatives (*unless* it has been specified that the full # string must satisfy this instance). length = max(lengths) if not full: return length # Ensure that the full string satisfies this instance; otherwise, # indicate failure. result = emp()._match(string, full=True, _index=(_index + length)) return length if result == 0 else None
[docs]class rep(are): """ Repetition operation (zero or more times) for an :obj:`are` instance. Instances of this class also serve as the internal nodes of the tree data structure representing an abstract regular expression. >>> r = rep(lit('a')) >>> all([r('a'*i) == i for i in range(100)]) True >>> {r('a'*i + 'b') for i in range(10)} {None} >>> {r(iter('a'*i + 'b')) for i in range(10)} {None} >>> r = con(lit('a'), rep(lit('b'))) >>> r('a' + 'b'*10) 11 >>> r(iter('a' + 'b'*10)) 11 >>> r = con(rep(lit('a')), lit('b')) >>> r('aaab') 4 >>> r(iter('aaab')) 4 >>> r = con(rep(lit('a')), lit('b')).compile() >>> r('aaab') 4 >>> r = rep(lit('a')).compile() >>> all([r('a'*i) == i for i in range(100)]) True Note that the empty abstract string satisfies any instance of this class. >>> r('') 0 >>> r('bbbb', full=False) 0 >>> all([r('a'*i + 'b', full=False) == i for i in range(100)]) True >>> all([r(iter('a'*i + 'b'), full=False) == i for i in range(100)]) True Any attempt to apply an abstract regular expression instance to a non-iterable raises an exception. >>> r(123) Traceback (most recent call last): ... ValueError: input must be an iterable """ def __new__(cls, argument): """Instance constructor.""" return super().__new__(cls, [argument]) def _match(self: are, string: Iterable, full: bool, _index: int) -> Optional[int]: """ One of an ensemble of mutually recursive matching functions (one function per type of base case or internal node of an abstract regular expression). Accepts an iterable of symbols and returns the length of the longest matching prefix of that string (or ``None``, if the matching criteria are not met). These functions are invoked by the :obj:`are.__call__` method of an instance. """ # pylint: disable=protected-access lengths = 0 length = self[0]._match(string, full=False, _index=_index) while length is not None: lengths += length length = self[0]._match(string, full=False, _index=(_index + lengths)) if not full: return lengths result = emp()._match(string, full=True, _index=(_index + lengths)) if result == 0: return lengths return None # Abstract string does not satisfy the regular expression.
if __name__ == '__main__': doctest.testmod() # pragma: no cover