from collections.abc import Iterable
def regular_expressions(sigma: set[str]) -> Iterable[str]:
= frozenset(sigma | {'∅', '𝜖'})
old_regex_set
for rho in old_regex_set:
yield rho
while True:
= set(old_regex_set)
new_regex_set
for rho in old_regex_set:
= rho+'*'
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= '('+rho1+' ∪ '+rho2+')'
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= '('+rho1+' ∘ '+rho2+')'
elem |= {elem}
new_regex_set yield elem
= frozenset(new_regex_set) old_regex_set
Formal Definition
Regular expressions themselves are strings. We formally define these strings recursively, starting with an alphabet \(\Sigma\), a symbol for the empty string \(\epsilon\), and a symbol for the empty set \(\emptyset\). Let’s assume the alphabet is the set of English phonemes:
\[\Sigma \equiv \{\text{ɑ, æ, ʌ, ɔ, aʊ, aɪ, b, tʃ, d, ð,} \ldots\}\]
\(\rho\) is a regular expression if and only if:
- \(\rho \in \Sigma \cup \{\epsilon, \emptyset\}\)
- \(\rho\) is \((\rho_1 \cup \rho_2)\) for some regular expressions \(\rho_1\) and \(\rho_2\)
- \(\rho\) is \((\rho_1 \circ \rho_2)\) for some regular expressions \(\rho_1\) and \(\rho_2\)
- \(\rho\) is \(\rho_1^*\) for some regular expression \(\rho_1\)
Thus, given the \(\Sigma\) above, the following are all regular expressions.
- æ
- b
- (æ \(\cup\) b)
- (æ \(\circ\) b\(^*\))
- (æ \(\circ\) b)\(^*\)
- ((æ \(\circ\) b) \(\cup\) b)
- (æ \(\circ\) (b \(\cup\) b))
The following are not regular expressions.
- æ \(\cup\) b
- æ \(\circ\) b\(^*\)
- æ \(\circ\) b \(\cup\) b
Another way of saying this is that the regular expressions on \(\Sigma\) are a language on \(\Sigma \cup\{\epsilon, \emptyset, \cup, \circ, (, ), *\}\). Note that, based on the recursive definition above, the set of regular expressions is infinite. Thus, the regular expressions on \(\Sigma\) are the first infinite language we’ve described (besides \(\Sigma^*\) itself).
We can generate all the regular expressions given an alphabet. Note that because the set of regular expressions is infinite, we need to use a generator.
= {"ɑ", "æ", "ə", "ʌ", "ɔ", "aʊ", "aɪ", "b", "tʃ", "d", "ð", "ɛ",
english_phonemes "ɝ", "eɪ", "f", "g", "h", "ɪ", "i", "dʒ", "k", "l", "m",
"n", "ŋ", "oʊ", "ɔɪ", "p", "ɹ", "s", "ʃ", "t", "θ", "ʊ",
"u", "v", "w", "j", "z", "ʒ"}
for i, r in enumerate(regular_expressions(english_phonemes)):
print(r)
if i > 100:
break
h
s
b
ɑ
f
∅
θ
n
u
p
dʒ
ɔɪ
ɛ
𝜖
eɪ
ŋ
oʊ
z
g
ʃ
tʃ
l
ɔ
æ
ə
ɝ
ɪ
ʒ
i
ʊ
j
d
ɹ
ð
t
ʌ
v
m
k
aʊ
w
aɪ
h*
s*
b*
ɑ*
f*
∅*
θ*
n*
u*
p*
dʒ*
ɔɪ*
ɛ*
𝜖*
eɪ*
ŋ*
oʊ*
z*
g*
ʃ*
tʃ*
l*
ɔ*
æ*
ə*
ɝ*
ɪ*
ʒ*
i*
ʊ*
j*
d*
ɹ*
ð*
t*
ʌ*
v*
m*
k*
aʊ*
w*
aɪ*
(h ∪ h)
(h ∪ s)
(h ∪ b)
(h ∪ ɑ)
(h ∪ f)
(h ∪ ∅)
(h ∪ θ)
(h ∪ n)
(h ∪ u)
(h ∪ p)
(h ∪ dʒ)
(h ∪ ɔɪ)
(h ∪ ɛ)
(h ∪ 𝜖)
(h ∪ eɪ)
(h ∪ ŋ)
(h ∪ oʊ)
(h ∪ z)
Regular expressions (so defined) evaluate to sets of strings on \(\Sigma\)–i.e. languages on \(\Sigma\). Another way of thinking about this is that a regular expression on \(\Sigma\) describes a language on \(\Sigma\).
We can define this evaluation procedure formally as a function \(\text{eval}: R(\Sigma) \rightarrow 2^{\Sigma^*}\), where \(R(\Sigma)\) is the set of regular expressions on \(\Sigma\).
\(\text{eval}(\rho) = \begin{cases}\{\} & \text{if } \rho = \emptyset \\\{\_\} & \text{if } \rho = \epsilon \\ \{\rho\} & \text{if } \rho \in \Sigma\\ \text{eval}(\rho_1) \times \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \circ \rho_2) \\ \text{eval}(\rho_1) \cup \text{eval}(\rho_2) & \text{if } \rho = (\rho_1 \cup \rho_2)\\ \bigcup_{i = 0}^\infty \text{eval}(\rho_1)^i & \text{if } \rho = \rho_1^*\\ \end{cases}\)
So first, we’re going to want to have access to the regular expressions’ structure for the purposes of evaluating them. We could do this by using pyparsing
or some hand-built parser to reconstruct the structure of each expression, but we may as well just build retain the structure while generating the expression.
def regular_expressions_structured(sigma):
= frozenset(sigma | {'∅', '𝜖'})
old_regex_set
for rho in old_regex_set:
yield rho
while True:
= set(old_regex_set)
new_regex_set
for rho in old_regex_set:
= (rho, '*')
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= (rho1, '∪', rho2)
elem |= {elem}
new_regex_set yield elem
for rho1 in old_regex_set:
for rho2 in old_regex_set:
= (rho1, '∘', rho2)
elem |= {elem}
new_regex_set yield elem
= frozenset(new_regex_set)
old_regex_set
for i, r in enumerate(regular_expressions_structured({'ə', 'm'})):
print(r)
if i > 100:
break
m
∅
ə
𝜖
('m', '*')
('∅', '*')
('ə', '*')
('𝜖', '*')
('m', '∪', 'm')
('m', '∪', '∅')
('m', '∪', 'ə')
('m', '∪', '𝜖')
('∅', '∪', 'm')
('∅', '∪', '∅')
('∅', '∪', 'ə')
('∅', '∪', '𝜖')
('ə', '∪', 'm')
('ə', '∪', '∅')
('ə', '∪', 'ə')
('ə', '∪', '𝜖')
('𝜖', '∪', 'm')
('𝜖', '∪', '∅')
('𝜖', '∪', 'ə')
('𝜖', '∪', '𝜖')
('m', '∘', 'm')
('m', '∘', '∅')
('m', '∘', 'ə')
('m', '∘', '𝜖')
('∅', '∘', 'm')
('∅', '∘', '∅')
('∅', '∘', 'ə')
('∅', '∘', '𝜖')
('ə', '∘', 'm')
('ə', '∘', '∅')
('ə', '∘', 'ə')
('ə', '∘', '𝜖')
('𝜖', '∘', 'm')
('𝜖', '∘', '∅')
('𝜖', '∘', 'ə')
('𝜖', '∘', '𝜖')
(('𝜖', '∘', 'm'), '*')
(('∅', '∘', 'ə'), '*')
(('∅', '∪', 'm'), '*')
(('ə', '∘', 'ə'), '*')
('∅', '*')
(('m', '∪', '𝜖'), '*')
(('m', '∪', '∅'), '*')
(('m', '∪', 'm'), '*')
(('𝜖', '∪', 'ə'), '*')
(('m', '∘', 'ə'), '*')
(('∅', '∪', '∅'), '*')
(('∅', '∘', '𝜖'), '*')
(('∅', '∘', '∅'), '*')
(('∅', '∘', 'm'), '*')
('𝜖', '*')
(('ə', '∘', '∅'), '*')
(('m', '*'), '*')
(('ə', '∘', '𝜖'), '*')
(('ə', '*'), '*')
(('ə', '∘', 'm'), '*')
(('𝜖', '∪', '𝜖'), '*')
(('∅', '∪', 'ə'), '*')
(('ə', '∪', 'ə'), '*')
(('𝜖', '∪', '∅'), '*')
(('𝜖', '∘', 'ə'), '*')
(('𝜖', '∪', 'm'), '*')
(('𝜖', '*'), '*')
('ə', '*')
(('m', '∘', '𝜖'), '*')
(('∅', '*'), '*')
(('m', '∘', '∅'), '*')
(('m', '∘', 'm'), '*')
(('m', '∪', 'ə'), '*')
('m', '*')
(('ə', '∪', '𝜖'), '*')
(('ə', '∪', '∅'), '*')
(('𝜖', '∘', '𝜖'), '*')
(('𝜖', '∘', '∅'), '*')
(('ə', '∪', 'm'), '*')
(('∅', '∪', '𝜖'), '*')
(('𝜖', '∘', 'm'), '∪', ('𝜖', '∘', 'm'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∘', 'ə'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∪', 'm'))
(('𝜖', '∘', 'm'), '∪', ('ə', '∘', 'ə'))
(('𝜖', '∘', 'm'), '∪', '∅')
(('𝜖', '∘', 'm'), '∪', ('m', '∪', '𝜖'))
(('𝜖', '∘', 'm'), '∪', ('m', '∪', '∅'))
(('𝜖', '∘', 'm'), '∪', ('m', '∪', 'm'))
(('𝜖', '∘', 'm'), '∪', ('𝜖', '∪', 'ə'))
(('𝜖', '∘', 'm'), '∪', ('m', '∘', 'ə'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∪', '∅'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∘', '𝜖'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∘', '∅'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∘', 'm'))
(('𝜖', '∘', 'm'), '∪', '𝜖')
(('𝜖', '∘', 'm'), '∪', ('ə', '∘', '∅'))
(('𝜖', '∘', 'm'), '∪', ('m', '*'))
(('𝜖', '∘', 'm'), '∪', ('ə', '∘', '𝜖'))
(('𝜖', '∘', 'm'), '∪', ('ə', '*'))
(('𝜖', '∘', 'm'), '∪', ('ə', '∘', 'm'))
(('𝜖', '∘', 'm'), '∪', ('𝜖', '∪', '𝜖'))
(('𝜖', '∘', 'm'), '∪', ('∅', '∪', 'ə'))
Next, we can write a function that recursively evaluates a regular expression.
def evaluate_regular_expression(regex):
if regex == '∅':
return
elif regex == '𝜖':
yield ''
elif isinstance(regex, str):
yield regex
elif regex[1] == '*':
= 0
i while True:
for s in evaluate_regular_expression(regex[0]):
yield s*i
+= 1
i
elif regex[1] == '∪':
for s1 in evaluate_regular_expression(regex[0]):
yield s1
for s2 in evaluate_regular_expression(regex[2]):
yield s2
elif regex[1] == '∘':
for s1 in evaluate_regular_expression(regex[0]):
for s2 in evaluate_regular_expression(regex[2]):
yield s1 + s2
for s in evaluate_regular_expression('ə'):
print(s)
ə
for s in evaluate_regular_expression(('ə', '∪', 'm')):
print(s)
ə
m
for s in evaluate_regular_expression(('ə', '∘', 'm')):
print(s)
əm
for s in evaluate_regular_expression(('ə', '∘', ('m', '∪', 'g'))):
print(s)
əm
əg
for i, s in enumerate(evaluate_regular_expression(('ə', '*'))):
print(s)
if i > 10:
break
ə
əə
əəə
əəəə
əəəəə
əəəəəə
əəəəəəə
əəəəəəəə
əəəəəəəəə
əəəəəəəəəə
əəəəəəəəəəə
for i, s in enumerate(evaluate_regular_expression((('ə', '∪', 'm'), '*'))):
print(s)
if i > 10:
break
ə
m
əə
mm
əəə
mmm
əəəə
mmmm
əəəəə
mmmmm