Evaluating Regular Expressions

Regular expressions evaluate to sets of strings on \(\Sigma\)–i.e. languages on \(\Sigma\). Another way of thinking about this is that a regular expression on \(\Sigma\) describes a language on \(\Sigma\).

We can define this evaluation procedure formally as a function \(\text{eval}: R(\Sigma) \rightarrow 2^{\Sigma^*}\), where \(R(\Sigma)\) is the set of regular expressions on \(\Sigma\).

\(\text{eval}(\rho) = \begin{cases}\{\} & \text{if } \rho = \emptyset \\\{\_\} & \text{if } \rho = \epsilon \\ \{\rho\} & \text{if } \rho \in \Sigma\\ \text{eval}(\rho_1) \times \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \circ \rho_2) \\ \text{eval}(\rho_1) \cup \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \cup \rho_2)\\ \bigcup_{i = 0}^\infty \text{eval}(\rho_1)^i & \text{if } \rho = \rho_1^*\\ \end{cases}\)

So first, we’re going to want to have access to the regular expressions’ structure for the purposes of evaluating them. We could do this by using pyparsing or some hand-built parser to reconstruct the structure of each expression, but we may as well just build retain the structure while generating the expression.

def regular_expressions_structured(sigma):
    old_regex_set = frozenset(sigma | {'βˆ…', 'πœ–'})
    
    for rho in old_regex_set:
        yield rho
    
    while True:
        new_regex_set = set(old_regex_set)
            
        for rho in old_regex_set:
            elem = (rho, '*')
            new_regex_set |= {elem}
            yield elem
            
        for rho1 in old_regex_set:
            for rho2 in old_regex_set:
                elem = (rho1, 'βˆͺ', rho2)
                new_regex_set |= {elem}
                yield elem
                
        for rho1 in old_regex_set:
            for rho2 in old_regex_set:
                elem = (rho1, '∘', rho2)
                new_regex_set |= {elem}
                yield elem
                
        old_regex_set = frozenset(new_regex_set)
        
for i, r in enumerate(regular_expressions_structured({'Ι™', 'm'})):
    print(r)
    
    if i > 100:
        break
m
βˆ…
Ι™
πœ–
('m', '*')
('βˆ…', '*')
('Ι™', '*')
('πœ–', '*')
('m', 'βˆͺ', 'm')
('m', 'βˆͺ', 'βˆ…')
('m', 'βˆͺ', 'Ι™')
('m', 'βˆͺ', 'πœ–')
('βˆ…', 'βˆͺ', 'm')
('βˆ…', 'βˆͺ', 'βˆ…')
('βˆ…', 'βˆͺ', 'Ι™')
('βˆ…', 'βˆͺ', 'πœ–')
('Ι™', 'βˆͺ', 'm')
('Ι™', 'βˆͺ', 'βˆ…')
('Ι™', 'βˆͺ', 'Ι™')
('Ι™', 'βˆͺ', 'πœ–')
('πœ–', 'βˆͺ', 'm')
('πœ–', 'βˆͺ', 'βˆ…')
('πœ–', 'βˆͺ', 'Ι™')
('πœ–', 'βˆͺ', 'πœ–')
('m', '∘', 'm')
('m', '∘', 'βˆ…')
('m', '∘', 'Ι™')
('m', '∘', 'πœ–')
('βˆ…', '∘', 'm')
('βˆ…', '∘', 'βˆ…')
('βˆ…', '∘', 'Ι™')
('βˆ…', '∘', 'πœ–')
('Ι™', '∘', 'm')
('Ι™', '∘', 'βˆ…')
('Ι™', '∘', 'Ι™')
('Ι™', '∘', 'πœ–')
('πœ–', '∘', 'm')
('πœ–', '∘', 'βˆ…')
('πœ–', '∘', 'Ι™')
('πœ–', '∘', 'πœ–')
(('πœ–', '∘', 'm'), '*')
(('βˆ…', '∘', 'Ι™'), '*')
(('βˆ…', 'βˆͺ', 'm'), '*')
(('Ι™', '∘', 'Ι™'), '*')
('βˆ…', '*')
(('m', 'βˆͺ', 'πœ–'), '*')
(('m', 'βˆͺ', 'βˆ…'), '*')
(('m', 'βˆͺ', 'm'), '*')
(('πœ–', 'βˆͺ', 'Ι™'), '*')
(('m', '∘', 'Ι™'), '*')
(('βˆ…', 'βˆͺ', 'βˆ…'), '*')
(('βˆ…', '∘', 'πœ–'), '*')
(('βˆ…', '∘', 'βˆ…'), '*')
(('βˆ…', '∘', 'm'), '*')
('πœ–', '*')
(('Ι™', '∘', 'βˆ…'), '*')
(('m', '*'), '*')
(('Ι™', '∘', 'πœ–'), '*')
(('Ι™', '*'), '*')
(('Ι™', '∘', 'm'), '*')
(('πœ–', 'βˆͺ', 'πœ–'), '*')
(('βˆ…', 'βˆͺ', 'Ι™'), '*')
(('Ι™', 'βˆͺ', 'Ι™'), '*')
(('πœ–', 'βˆͺ', 'βˆ…'), '*')
(('πœ–', '∘', 'Ι™'), '*')
(('πœ–', 'βˆͺ', 'm'), '*')
(('πœ–', '*'), '*')
('Ι™', '*')
(('m', '∘', 'πœ–'), '*')
(('βˆ…', '*'), '*')
(('m', '∘', 'βˆ…'), '*')
(('m', '∘', 'm'), '*')
(('m', 'βˆͺ', 'Ι™'), '*')
('m', '*')
(('Ι™', 'βˆͺ', 'πœ–'), '*')
(('Ι™', 'βˆͺ', 'βˆ…'), '*')
(('πœ–', '∘', 'πœ–'), '*')
(('πœ–', '∘', 'βˆ…'), '*')
(('Ι™', 'βˆͺ', 'm'), '*')
(('βˆ…', 'βˆͺ', 'πœ–'), '*')
(('πœ–', '∘', 'm'), 'βˆͺ', ('πœ–', '∘', 'm'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', '∘', 'Ι™'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', 'βˆͺ', 'm'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('Ι™', '∘', 'Ι™'))
(('πœ–', '∘', 'm'), 'βˆͺ', 'βˆ…')
(('πœ–', '∘', 'm'), 'βˆͺ', ('m', 'βˆͺ', 'πœ–'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('m', 'βˆͺ', 'βˆ…'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('m', 'βˆͺ', 'm'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('πœ–', 'βˆͺ', 'Ι™'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('m', '∘', 'Ι™'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', 'βˆͺ', 'βˆ…'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', '∘', 'πœ–'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', '∘', 'βˆ…'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', '∘', 'm'))
(('πœ–', '∘', 'm'), 'βˆͺ', 'πœ–')
(('πœ–', '∘', 'm'), 'βˆͺ', ('Ι™', '∘', 'βˆ…'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('m', '*'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('Ι™', '∘', 'πœ–'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('Ι™', '*'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('Ι™', '∘', 'm'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('πœ–', 'βˆͺ', 'πœ–'))
(('πœ–', '∘', 'm'), 'βˆͺ', ('βˆ…', 'βˆͺ', 'Ι™'))

Next, we can write a function that recursively evaluates a regular expression.

def evaluate_regular_expression(regex):
    if regex == 'βˆ…':
        return
    
    elif regex == 'πœ–':
        yield ''
    
    elif isinstance(regex, str):
        yield regex
    
    elif regex[1] == '*':
        i = 0
        while True:
            for s in evaluate_regular_expression(regex[0]):
                yield s*i
            
            i += 1
            
    elif regex[1] == 'βˆͺ':
        for s1 in evaluate_regular_expression(regex[0]):
            yield s1
            
        for s2 in evaluate_regular_expression(regex[2]):
            yield s2
            
    elif regex[1] == '∘':
        for s1 in evaluate_regular_expression(regex[0]):
            for s2 in evaluate_regular_expression(regex[2]):
                yield s1 + s2
for s in evaluate_regular_expression('Ι™'):
    print(s)
Ι™
for s in evaluate_regular_expression(('Ι™', 'βˆͺ', 'm')):
    print(s)
Ι™
m
for s in evaluate_regular_expression(('Ι™', '∘', 'm')):
    print(s)
Ι™m
for s in evaluate_regular_expression(('Ι™', '∘', ('m', 'βˆͺ', 'g'))):
    print(s)
Ι™m
Ι™g
for i, s in enumerate(evaluate_regular_expression(('Ι™', '*'))):
    print(s)
    
    if i > 10:
        break

Ι™
Ι™Ι™
Ι™Ι™Ι™
Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™
Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™Ι™
for i, s in enumerate(evaluate_regular_expression((('Ι™', 'βˆͺ', 'm'), '*'))):
    print(s)
    
    if i > 10:
        break


Ι™
m
Ι™Ι™
mm
Ι™Ι™Ι™
mmm
Ι™Ι™Ι™Ι™
mmmm
Ι™Ι™Ι™Ι™Ι™
mmmmm