import urllib.request
import os
if not os.path.exists("cmudict-0.7b"):
urllib.request.urlretrieve(
"https://raw.githubusercontent.com/Alexir/CMUdict/master/cmudict-0.7b",
"cmudict-0.7b"
)Loading a Lexicon
One use case for regular expressions in the context of computational linguistics is querying a lexicon. To look at this use case, we will use the lexicon of word forms found in the CMU Pronouncing Dictionary.
with open("cmudict-0.7b") as f:
for i, line in enumerate(f):
if i >= 200:
break
print(line, end="");;; # CMUdict -- Major Version: 0.07
;;;
;;; # $HeadURL: https://svn.code.sf.net/p/cmusphinx/code/branches/cmudict/cmudict-0.7b $
;;; # $Date:: 2015-07-15 12:34:30 -0400 (Wed, 15 Jul 2015) $:
;;; # $Id:: cmudict-0.7b 13083 2015-07-15 16:34:30Z air $:
;;; # $Rev:: 13083 $:
;;; # $Author:: air $:
;;;
;;; #
;;; # ========================================================================
;;; # Copyright (C) 1993-2015 Carnegie Mellon University. All rights reserved.
;;; #
;;; # Redistribution and use in source and binary forms, with or without
;;; # modification, are permitted provided that the following conditions
;;; # are met:
;;; #
;;; # 1. Redistributions of source code must retain the above copyright
;;; # notice, this list of conditions and the following disclaimer.
;;; # The contents of this file are deemed to be source code.
;;; #
;;; # 2. Redistributions in binary form must reproduce the above copyright
;;; # notice, this list of conditions and the following disclaimer in
;;; # the documentation and/or other materials provided with the
;;; # distribution.
;;; #
;;; # This work was supported in part by funding from the Defense Advanced
;;; # Research Projects Agency, the Office of Naval Research and the National
;;; # Science Foundation of the United States of America, and by member
;;; # companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
;;; # the contributions of many volunteers to the expansion and improvement of
;;; # this dictionary.
;;; #
;;; # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
;;; # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
;;; # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
;;; # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
;;; # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;;; # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;;; # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;;; # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;;; # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;;; # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;;; # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;; #
;;; # ========================================================================
;;; #
;;;
;;; NOTES ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; [20080401] (air) New dict file format introduced
;;; - comments (like this section) are allowed
;;; - file name is major version; vers/rev information is now in the header
;;;
;;;
;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T
"CLOSE-QUOTE K L OW1 Z K W OW1 T
"DOUBLE-QUOTE D AH1 B AH0 L K W OW1 T
"END-OF-QUOTE EH1 N D AH0 V K W OW1 T
"END-QUOTE EH1 N D K W OW1 T
"IN-QUOTES IH1 N K W OW1 T S
"QUOTE K W OW1 T
"UNQUOTE AH1 N K W OW1 T
#HASH-MARK HH AE1 M AA2 R K
#POUND-SIGN P AW1 N D S AY2 N
#SHARP-SIGN SH AA1 R P S AY2 N
%PERCENT P ER0 S EH1 N T
&ERSAND AE1 M P ER0 S AE2 N D
'ALLO AA2 L OW1
'APOSTROPHE AH0 P AA1 S T R AH0 F IY0
'BOUT B AW1 T
'CAUSE K AH0 Z
'COURSE K AO1 R S
'CUSE K Y UW1 Z
'EM AH0 M
'END-INNER-QUOTE EH1 N D IH1 N ER0 K W OW1 T
'END-QUOTE EH1 N D K W OW1 T
'FRISCO F R IH1 S K OW0
'GAIN G EH1 N
'INNER-QUOTE IH1 N ER0 K W OW1 T
'KAY K EY1
'M AH0 M
'N AH0 N
'QUOTE K W OW1 T
'ROUND R AW1 N D
'S EH1 S
'SINGLE-QUOTE S IH1 NG G AH0 L K W OW1 T
'TIL T IH1 L
'TIS T IH1 Z
'TWAS T W AH1 Z
(BEGIN-PARENS B IH0 G IH1 N P ER0 EH1 N Z
(IN-PARENTHESES IH1 N P ER0 EH1 N TH AH0 S IY2 Z
(LEFT-PAREN L EH1 F T P ER0 EH1 N
(OPEN-PARENTHESES OW1 P AH0 N P ER0 EH1 N TH AH0 S IY2 Z
(PAREN P ER0 EH1 N
(PARENS P ER0 EH1 N Z
(PARENTHESES P ER0 EH1 N TH AH0 S IY2 Z
)CLOSE-PAREN K L OW1 Z P ER0 EH1 N
)CLOSE-PARENTHESES K L OW1 Z P ER0 EH1 N TH AH0 S IY2 Z
)END-PAREN EH1 N D P ER0 EH1 N
)END-PARENS EH1 N D P ER0 EH1 N Z
)END-PARENTHESES EH1 N D P ER0 EH1 N TH AH0 S IY2 Z
)END-THE-PAREN EH1 N D DH AH0 P ER0 EH1 N
)PAREN P ER0 EH1 N
)PARENS P ER0 EH1 N Z
)RIGHT-PAREN R AY1 T P EH2 R AH0 N
)UN-PARENTHESES AH1 N P ER0 EH1 N TH AH0 S IY1 Z
+PLUS P L UH1 S
,COMMA K AA1 M AH0
--DASH D AE1 SH
-DASH D AE1 SH
-HYPHEN HH AY1 F AH0 N
...ELLIPSIS IH2 L IH1 P S IH0 S
.DECIMAL D EH1 S AH0 M AH0 L
.DOT D AA1 T
.FULL-STOP F UH1 L S T AA1 P
.PERIOD P IH1 R IY0 AH0 D
.POINT P OY1 N T
/SLASH S L AE1 SH
3-D TH R IY1 D IY2
3D TH R IY1 D IY2
:COLON K OW1 L AH0 N
;SEMI-COLON S EH1 M IY0 K OW1 L AH0 N
;SEMI-COLON(1) S EH1 M IH0 K OW2 L AH0 N
?QUESTION-MARK K W EH1 S CH AH0 N M AA1 R K
A AH0
A(1) EY1
A'S EY1 Z
A. EY1
A.'S EY1 Z
A.D. EY2 D IY1
A.M. EY2 EH1 M
A.S EY1 Z
A42128 EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T
AA EY2 EY1
AAA T R IH2 P AH0 L EY1
AAAI T R IH2 P AH0 L EY2 AY1
AABERG AA1 B ER0 G
AACHEN AA1 K AH0 N
AACHENER AA1 K AH0 N ER0
AAH AA1
AAKER AA1 K ER0
AALIYAH AA2 L IY1 AA2
AALSETH AA1 L S EH0 TH
AAMODT AA1 M AH0 T
AANCOR AA1 N K AO2 R
AARDEMA AA0 R D EH1 M AH0
AARDVARK AA1 R D V AA2 R K
AARDVARKS AA1 R D V AA2 R K S
AARGH AA1 R G
AARHUS AA2 HH UW1 S
AARON EH1 R AH0 N
AARON'S EH1 R AH0 N Z
AARONS EH1 R AH0 N Z
AARONSON EH1 R AH0 N S AH0 N
AARONSON(1) AA1 R AH0 N S AH0 N
AARONSON'S EH1 R AH0 N S AH0 N Z
AARONSON'S(1) AA1 R AH0 N S AH0 N Z
AARTI AA1 R T IY2
AASE AA1 S
AASEN AA1 S AH0 N
AB AE1 B
ABA EY2 B IY2 EY1
ABABA AH0 B AA1 B AH0
ABABA(1) AA1 B AH0 B AH0
ABACHA AE1 B AH0 K AH0
ABACK AH0 B AE1 K
ABACO AE1 B AH0 K OW2
ABACUS AE1 B AH0 K AH0 S
ABAD AH0 B AA1 D
ABADAKA AH0 B AE1 D AH0 K AH0
ABADI AH0 B AE1 D IY0
ABADIE AH0 B AE1 D IY0
ABAIR AH0 B EH1 R
ABALKIN AH0 B AA1 L K IH0 N
ABALONE AE2 B AH0 L OW1 N IY0
ABALONES AE2 B AH0 L OW1 N IY0 Z
ABALOS AA0 B AA1 L OW0 Z
ABANDON AH0 B AE1 N D AH0 N
ABANDONED AH0 B AE1 N D AH0 N D
ABANDONING AH0 B AE1 N D AH0 N IH0 NG
ABANDONMENT AH0 B AE1 N D AH0 N M AH0 N T
ABANDONMENTS AH0 B AE1 N D AH0 N M AH0 N T S
ABANDONS AH0 B AE1 N D AH0 N Z
ABANTO AH0 B AE1 N T OW0
ABARCA AH0 B AA1 R K AH0
ABARE AA0 B AA1 R IY0
ABASCAL AE1 B AH0 S K AH0 L
ABASH AH0 B AE1 SH
ABASHED AH0 B AE1 SH T
ABASIA AH0 B EY1 ZH Y AH0
ABATE AH0 B EY1 T
ABATED AH0 B EY1 T IH0 D
ABATEMENT AH0 B EY1 T M AH0 N T
ABATEMENTS AH0 B EY1 T M AH0 N T S
ABATES AH0 B EY1 T S
ABATING AH0 B EY1 T IH0 NG
ABATTOIR AE2 B AH0 T W AA1 R
ABBA AE1 B AH0
with open('cmudict-0.7b', encoding = "ISO-8859-1") as f:
cmudict = [l.split() for l in f if l[:3] != ";;;"]
cmudict = {w[0].lower(): w[1:] for w in cmudict}
cmudict["abstraction"]['AE0', 'B', 'S', 'T', 'R', 'AE1', 'K', 'SH', 'AH0', 'N']
This dictionary uses what’s known as the ARPABET and represents stress using numeric indicators: 0 for no stress, 1 for primary stress, and 2 for secondary stress. The ARPABET maps to more recognizable IPA representations in the following way.
arpabet_to_phoneme = {'AA': 'ɑ',
'AE': 'æ',
'AH': 'ə',
'AO': 'ɔ',
'AW': 'aʊ',
'AY': 'aɪ',
'B': 'b',
'CH': 'tʃ',
'D': 'd',
'DH': 'ð',
'EH': 'ɛ',
'ER': 'ɝ',
'EY': 'eɪ',
'F': 'f',
'G': 'g',
'HH': 'h',
'IH': 'ɪ',
'IY': 'i',
'JH': 'dʒ',
'K': 'k',
'L': 'l',
'M': 'm',
'N': 'n',
'NG': 'ŋ',
'OW': 'oʊ',
'OY': 'ɔɪ',
'P': 'p',
'R': 'ɹ',
'S': 's',
'SH': 'ʃ',
'T': 't',
'TH': 'θ',
'UH': 'ʊ',
'UW': 'u',
'V': 'v',
'W': 'w',
'Y': 'j',
'Z': 'z',
'ZH': 'ʒ'}We’ll use this mapping to construct the IPA representation from the ARPABET representation.
import re
entries: dict[str, list[str]] = {
w: [arpabet_to_phoneme[''.join(re.findall('[A-z]', phoneme))]
for phoneme in transcription]
for w, transcription in cmudict.items()
if len({c for c in w})>1
if len(transcription)>1
if not re.findall('[^A-z]', w[0])
}For instance, the IPA representation for the word abstraction can be accessed in the following way.
entries["abstraction"]['æ', 'b', 's', 't', 'ɹ', 'æ', 'k', 'ʃ', 'ə', 'n']
Dump IPA representation to file
with open("cmudict-ipa", "w") as f:
for w, ipa in entries.items():
ipa = " ".join(ipa)
f.write(f"{w},{ipa}\n")