def regular_expressions_structured(sigma):
= frozenset(sigma | {'β
', 'π'})
old_regex_set
for rho in old_regex_set:
yield rho
while True:
= set(old_regex_set)
new_regex_set
for rho in old_regex_set:
= (rho, '*')
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= (rho1, 'βͺ', rho2)
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= (rho1, 'β', rho2)
elem |= {elem}
new_regex_set yield elem
= frozenset(new_regex_set)
old_regex_set
Evaluating Regular Expressions
Regular expressions evaluate to sets of strings on \(\Sigma\)βi.e. languages on \(\Sigma\). Another way of thinking about this is that a regular expression on \(\Sigma\) describes a language on \(\Sigma\).
We can define this evaluation procedure formally as a function \(\text{eval}: R(\Sigma) \rightarrow 2^{\Sigma^*}\), where \(R(\Sigma)\) is the set of regular expressions on \(\Sigma\).
\(\text{eval}(\rho) = \begin{cases}\{\} & \text{if } \rho = \emptyset \\\{\_\} & \text{if } \rho = \epsilon \\ \{\rho\} & \text{if } \rho \in \Sigma\\ \text{eval}(\rho_1) \times \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \circ \rho_2) \\ \text{eval}(\rho_1) \cup \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \cup \rho_2)\\ \bigcup_{i = 0}^\infty \text{eval}(\rho_1)^i & \text{if } \rho = \rho_1^*\\ \end{cases}\)
So first, weβre going to want to have access to the regular expressionsβ structure for the purposes of evaluating them. We could do this by using pyparsing
or some hand-built parser to reconstruct the structure of each expression, but we may as well just build retain the structure while generating the expression.
for i, r in enumerate(regular_expressions_structured({'Ι', 'm'})):
print(r)
if i > 100:
break
m
β
Ι
π
('m', '*')
('β
', '*')
('Ι', '*')
('π', '*')
('m', 'βͺ', 'm')
('m', 'βͺ', 'β
')
('m', 'βͺ', 'Ι')
('m', 'βͺ', 'π')
('β
', 'βͺ', 'm')
('β
', 'βͺ', 'β
')
('β
', 'βͺ', 'Ι')
('β
', 'βͺ', 'π')
('Ι', 'βͺ', 'm')
('Ι', 'βͺ', 'β
')
('Ι', 'βͺ', 'Ι')
('Ι', 'βͺ', 'π')
('π', 'βͺ', 'm')
('π', 'βͺ', 'β
')
('π', 'βͺ', 'Ι')
('π', 'βͺ', 'π')
('m', 'β', 'm')
('m', 'β', 'β
')
('m', 'β', 'Ι')
('m', 'β', 'π')
('β
', 'β', 'm')
('β
', 'β', 'β
')
('β
', 'β', 'Ι')
('β
', 'β', 'π')
('Ι', 'β', 'm')
('Ι', 'β', 'β
')
('Ι', 'β', 'Ι')
('Ι', 'β', 'π')
('π', 'β', 'm')
('π', 'β', 'β
')
('π', 'β', 'Ι')
('π', 'β', 'π')
(('π', 'β', 'm'), '*')
(('β
', 'β', 'Ι'), '*')
(('β
', 'βͺ', 'm'), '*')
(('Ι', 'β', 'Ι'), '*')
('β
', '*')
(('m', 'βͺ', 'π'), '*')
(('m', 'βͺ', 'β
'), '*')
(('m', 'βͺ', 'm'), '*')
(('π', 'βͺ', 'Ι'), '*')
(('m', 'β', 'Ι'), '*')
(('β
', 'βͺ', 'β
'), '*')
(('β
', 'β', 'π'), '*')
(('β
', 'β', 'β
'), '*')
(('β
', 'β', 'm'), '*')
('π', '*')
(('Ι', 'β', 'β
'), '*')
(('m', '*'), '*')
(('Ι', 'β', 'π'), '*')
(('Ι', '*'), '*')
(('Ι', 'β', 'm'), '*')
(('π', 'βͺ', 'π'), '*')
(('β
', 'βͺ', 'Ι'), '*')
(('Ι', 'βͺ', 'Ι'), '*')
(('π', 'βͺ', 'β
'), '*')
(('π', 'β', 'Ι'), '*')
(('π', 'βͺ', 'm'), '*')
(('π', '*'), '*')
('Ι', '*')
(('m', 'β', 'π'), '*')
(('β
', '*'), '*')
(('m', 'β', 'β
'), '*')
(('m', 'β', 'm'), '*')
(('m', 'βͺ', 'Ι'), '*')
('m', '*')
(('Ι', 'βͺ', 'π'), '*')
(('Ι', 'βͺ', 'β
'), '*')
(('π', 'β', 'π'), '*')
(('π', 'β', 'β
'), '*')
(('Ι', 'βͺ', 'm'), '*')
(('β
', 'βͺ', 'π'), '*')
(('π', 'β', 'm'), 'βͺ', ('π', 'β', 'm'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'β', 'Ι'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'βͺ', 'm'))
(('π', 'β', 'm'), 'βͺ', ('Ι', 'β', 'Ι'))
(('π', 'β', 'm'), 'βͺ', 'β
')
(('π', 'β', 'm'), 'βͺ', ('m', 'βͺ', 'π'))
(('π', 'β', 'm'), 'βͺ', ('m', 'βͺ', 'β
'))
(('π', 'β', 'm'), 'βͺ', ('m', 'βͺ', 'm'))
(('π', 'β', 'm'), 'βͺ', ('π', 'βͺ', 'Ι'))
(('π', 'β', 'm'), 'βͺ', ('m', 'β', 'Ι'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'βͺ', 'β
'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'β', 'π'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'β', 'β
'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'β', 'm'))
(('π', 'β', 'm'), 'βͺ', 'π')
(('π', 'β', 'm'), 'βͺ', ('Ι', 'β', 'β
'))
(('π', 'β', 'm'), 'βͺ', ('m', '*'))
(('π', 'β', 'm'), 'βͺ', ('Ι', 'β', 'π'))
(('π', 'β', 'm'), 'βͺ', ('Ι', '*'))
(('π', 'β', 'm'), 'βͺ', ('Ι', 'β', 'm'))
(('π', 'β', 'm'), 'βͺ', ('π', 'βͺ', 'π'))
(('π', 'β', 'm'), 'βͺ', ('β
', 'βͺ', 'Ι'))
Next, we can write a function that recursively evaluates a regular expression.
def evaluate_regular_expression(regex):
if regex == 'β
':
return
elif regex == 'π':
yield ''
elif isinstance(regex, str):
yield regex
elif regex[1] == '*':
= 0
i while True:
for s in evaluate_regular_expression(regex[0]):
yield s*i
+= 1
i
elif regex[1] == 'βͺ':
for s1 in evaluate_regular_expression(regex[0]):
yield s1
for s2 in evaluate_regular_expression(regex[2]):
yield s2
elif regex[1] == 'β':
for s1 in evaluate_regular_expression(regex[0]):
for s2 in evaluate_regular_expression(regex[2]):
yield s1 + s2
for s in evaluate_regular_expression('Ι'):
print(s)
Ι
for s in evaluate_regular_expression(('Ι', 'βͺ', 'm')):
print(s)
Ι
m
for s in evaluate_regular_expression(('Ι', 'β', 'm')):
print(s)
Ιm
for s in evaluate_regular_expression(('Ι', 'β', ('m', 'βͺ', 'g'))):
print(s)
Ιm
Ιg
for i, s in enumerate(evaluate_regular_expression(('Ι', '*'))):
print(s)
if i > 10:
break
Ι
ΙΙ
ΙΙΙ
ΙΙΙΙ
ΙΙΙΙΙ
ΙΙΙΙΙΙ
ΙΙΙΙΙΙΙ
ΙΙΙΙΙΙΙΙ
ΙΙΙΙΙΙΙΙΙ
ΙΙΙΙΙΙΙΙΙΙ
ΙΙΙΙΙΙΙΙΙΙΙ
for i, s in enumerate(evaluate_regular_expression((('Ι', 'βͺ', 'm'), '*'))):
print(s)
if i > 10:
break
Ι
m
ΙΙ
mm
ΙΙΙ
mmm
ΙΙΙΙ
mmmm
ΙΙΙΙΙ
mmmmm