import pyparsing
from typing import TypeVar, Optional
from collections.abc import Hashable, Callable
= Hashable
DataType = list[str, Optional[list['TreeList']]]
TreeList = tuple[DataType, Optional[tuple['TreeTuple', ...]]]
TreeTuple
class Tree:
= pyparsing.Suppress('(')
LPAR = pyparsing.Suppress(')')
RPAR = pyparsing.Regex(r'[^\(\)\s]+')
DATA
= pyparsing.Forward()
PARSER = pyparsing.ZeroOrMore(PARSER)
SUBTREE = pyparsing.Group(LPAR + DATA + SUBTREE + RPAR)
PARSERLIST <<= DATA | PARSERLIST
PARSER
def __init__(self, data: DataType, children: list['Tree'] = []):
self._data = data
self._children = children
self._validate()
def to_tuple(self) -> TreeTuple:
return self._data, tuple(c.to_tuple() for c in self._children)
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other: 'Tree') -> bool:
return self.to_tuple() == other.to_tuple()
def __str__(self) -> str:
return ' '.join(self.terminals)
def __repr__(self) -> str:
return self.to_string()
def to_string(self, depth=0) -> str:
= (depth - 1) * ' ' +\
s int(depth > 0) * '--' +\
self._data + '\n'
+= ''.join(c.to_string(depth+1)
s for c in self._children)
return s
def __contains__(self, data: DataType) -> bool:
# pre-order depth-first search
if self._data == data:
return True
else:
for child in self._children:
if data in child:
return True
return False
def __getitem__(self, idx: int | tuple[int, ...]) -> 'Tree':
if isinstance(idx, int):
return self._children[idx]
elif len(idx) == 1:
return self._children[idx[0]]
elif idx:
return self._children[idx[0]].__getitem__(idx[1:])
else:
return self
@property
def data(self) -> DataType:
return self._data
@property
def children(self) -> list['Tree']:
return self._children
@property
def terminals(self) -> list[str]:
if self._children:
return [w for c in self._children
for w in c.terminals]
else:
return [str(self._data)]
def _validate(self) -> None:
try:
assert all(isinstance(c, Tree)
for c in self._children)
except AssertionError:
= 'all children must be trees'
msg raise TypeError(msg)
def index(self, data: DataType, index_path: tuple[int, ...] = tuple()) -> list[tuple[int, ...]]:
= [index_path] if self._data==data else []
indices = [] if index_path == -1 else index_path
root_path
+= [j
indices for i, c in enumerate(self._children)
for j in c.index(data, root_path+(i,))]
return indices
def relabel(self, label_map: Callable[[DataType], DataType],
bool = False, terminals_only: bool = False) -> 'Tree':
nonterminals_only: if not nonterminals_only and not terminals_only:
= label_map(self._data)
data elif nonterminals_only and self._children:
= label_map(self._data)
data elif terminals_only and not self._children:
= label_map(self._data)
data else:
= self._data
data
= [c.relabel(label_map, nonterminals_only, terminals_only)
children for c in self._children]
return self.__class__(data, children)
@classmethod
def from_string(cls, treestr: str) -> 'Tree':
= cls.PARSER.parseString(treestr[2:-2])[0]
treelist
return cls.from_list(treelist)
@classmethod
def from_list(cls, treelist: TreeList) -> 'Tree':
if isinstance(treelist, str):
return cls(treelist[0])
elif isinstance(treelist[1], str):
return cls(treelist[0], [cls(treelist[1])])
else:
return cls(treelist[0], [cls.from_list(l) for l in treelist[1:]])
Assignments 9 and 10
Assignment 9 consists of Tasks 1 and 2 and Assignment 10 consists of Tasks 3-5.
In these assignments, we will focus on (i) extracting context free grammars (CFGs) from a treebank; and (ii) implement the CKY and Earley algorithms for recognizing/parsing CFGs.
Trees
We’ll use a slightly augmented version of the Tree
class we developed earlier in the class to represent trees that should be output by the parser.
CFG rules
Vanilla context free grammar rules (in contrast to the dotted rules we use for Earley, define below) will be represented using a fairly lightweight wrapper around pairings of a string (the left side) with a tuple of strings (the right side).
from enum import Enum
class NormalForm(Enum):
= 0
CNF = 1
BNF = 2
GNF
class Rule:
"""A context free grammar rule
Parameters
----------
left_side
right_side
"""
def __init__(self, left_side: str, *right_side: str):
self._left_side = left_side
self._right_side = right_side
def __repr__(self) -> str:
return self._left_side + ' -> ' + ' '.join(self._right_side)
def to_tuple(self) -> tuple[str, tuple[str, ...]]:
return (self._left_side, self._right_side)
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other: 'Rule') -> bool:
= self._left_side == other._left_side
left_side_equal = self._right_side == other._right_side
right_side_equal
return left_side_equal and right_side_equal
def validate(self, alphabet: set[str], variables: set[str], normal_form: NormalForm = NormalForm.CNF):
"""validate the rule
Parameters
----------
alphabet : set(str)
variables : set(str)
"""
if self._left_side not in variables:
= "left side of rule must be a variable"
msg raise ValueError(msg)
= alphabet | variables | {''}
acceptable
if not all([s in acceptable for s in self._right_side]):
= "right side of rule must contain only" +\
msg "a variable, a symbol, or the empty string"
raise ValueError(msg)
if normal_form == NormalForm.CNF:
try:
if len(self.right_side) == 1:
assert self.right_side[0] in alphabet
elif len(self.right_side) == 2:
assert all([s in variables for s in self.right_side])
else:
raise AssertionError
except AssertionError:
raise ValueError(f"{self} is not in CNF")
@property
def left_side(self) -> str:
return self._left_side
@property
def right_side(self) -> tuple[str, ...]:
return self._right_side
@property
def is_unary(self) -> bool:
return len(self._right_side) == 1
@property
def is_binary(self) -> bool:
return len(self._right_side) == 2
Defining a rule is straightforward.
"S", "NP", "VP") Rule(
S -> NP VP
Note that these rules are hashable, so they can be members of a python set
.
"S", "NP", "VP"), Rule("S", "NP", "VP")} {Rule(
{S -> NP VP}
Context Free Grammar
As in the previous two assignments, we will define a ContextFreeGrammar
to align very closely with the formal definition we covered in class. But because the recognition and parsing algorithms are slightly more complicated than those for finite state automata, we will factor the parsing algorithms themselves out of this class. To do this, we will specify a parser class that the grammar will initialize during its own initilization. We will set it to None
to start, which will mean that we won’t be able to use the ContextFreeGrammar.__call__
method until we define a parser.
from typing import Literal
from functools import lru_cache
= Literal["recognize", "parse"]
Mode
class ContextFreeGrammar:
"""
A context free grammar
Parameters
----------
alphabet : set(str)
variables : set(str)
rules : set(Rule)
start_variable : str
Attributes
----------
alphabet : set(str)
variables : set(str)
rules : set(Rule)
start_variable : str
Methods
-------
reduce(left_side)
"""
# this will need to be filled in by the parser class, once it is defined:
# - CKYParser
# - EarleyParser
= None
parser_class
def __init__(self, alphabet: set[str], variables: set[str], rules: set[Rule], start_variable: str):
self._alphabet = alphabet
self._variables = variables
self._rules = rules
self._start_variable = start_variable
self._validate_variables()
self._validate_rules()
if self.parser_class is not None:
self._parser = self.parser_class(self)
else:
self._parser = None
def __call__(self, string: str | list[str], mode: Mode = "recognize"):
if self._parser is not None:
return self._parser(string, mode)
else:
raise AttributeError("no parser is specified")
def _validate_variables(self):
if self._alphabet & self._variables:
raise ValueError('alphabet and variables must not share elements')
if self._start_variable not in self._variables:
raise ValueError('start variable must be in set of variables')
def _validate_rules(self):
if self.parser_class is not None:
for r in self._rules:
self._alphabet, self._variables,
r.validate(self.parser_class.normal_form)
@property
def alphabet(self) -> set[str]:
return self._alphabet
@property
def variables(self) -> set[str]:
return self._variables
@lru_cache(2**10)
def rules(self, left_side: str | None = None) -> set[Rule]:
if left_side is None:
return self._rules
else:
return {rule for rule in self._rules
if rule.left_side == left_side}
@property
def start_variable(self) -> str:
return self._start_variable
@lru_cache(2**14)
def parts_of_speech(self, word: str | None = None) -> set[str]:
if word is None:
return {rule.left_side for rule in self._rules
if rule.is_unary
if rule.right_side[0] in self._alphabet}
else:
return {rule.left_side for rule in self._rules
if rule.is_unary
if rule.right_side[0] == word}
@property
def phrase_variables(self) -> set[str]:
try:
return self._phrase_variables
except AttributeError:
self._phrase_variables = {rule.left_side for rule in self._rules
if not rule.is_unary or
0] not in self._alphabet}
rule.right_side[return self._phrase_variables
@lru_cache(2^15)
def reduce(self, *right_side: str) -> set[str]:
"""
the nonterminals that can be rewritten as right_side
Parameters
----------
right_side
Returns
-------
set(str)
"""
return {r.left_side for r in self._rules
if r.right_side == tuple(right_side)}
We can then define a context free grammar as below.
= ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
grammar 'with', 'a', 'fork', 'again', 'quickly'},
={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
variables={Rule('S', 'NP', 'VP'),
rules'NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
Rule(='S') start_variable
There are a few attributes and methods to take note of here. One allows you to easily work with parts of speech…
grammar.parts_of_speech()
{'Adv', 'D', 'N', 'P', 'V'}
…including finding all the parts of speech for a particular word.
"fork") grammar.parts_of_speech(
{'N', 'V'}
Another allows easy access to rules…
grammar.rules()
{Adv -> again,
Adv -> quickly,
D -> a,
D -> the,
N -> fork,
N -> greyhound,
N -> salmon,
NP -> D N,
NP -> NP PP,
P -> with,
PP -> P NP,
S -> NP VP,
V -> ate,
V -> fork,
VP -> Adv VP,
VP -> V NP,
VP -> VP Adv,
VP -> VP PP}
…including finding all rules that have a particular left side.
"VP") grammar.rules(
{VP -> Adv VP, VP -> V NP, VP -> VP Adv, VP -> VP PP}
Most importantly, I have implemented a ContextFreeGrammar.reduce
method for finding all left sides that can be rewritten from a particular right side.
reduce("V", "NP") grammar.
{'VP'}
Treebank
We will extract a CFG from the English Web Treebank. We can use the reader developed earlier in the course to read the treebank.
import tarfile
from abc import ABC
class TreeBank(ABC):
def __iter__(self):
return self
def __next__(self):
return next(self._tree_iter)
class EnglishWebTreebank(TreeBank):
def __init__(self, root='LDC2012T13.tgz'):
self._root = root
self._tree_iter = self._construct_tree_iter()
def _construct_tree_iter(self):
with tarfile.open(self._root) as corpus:
for fname in corpus.getnames():
if '.xml.tree' in fname:
with corpus.extractfile(fname) as treefile:
= treefile.readline().decode()
treestr yield fname, Tree.from_string(treestr)
Task 1
Write a class method ContextFreeGrammar.from_treebank
that extracts all of the alphabet elements, variables, and rules implied by the trees in the treebank. For instance, the rules implied by the following tree…
print(next(EnglishWebTreebank())[1].__repr__())
…are:
VB -> try
PP-LOC -> IN NP
NP -> NNP NNP
JJ -> argentinian
UH -> please
NP -> DT JJ NN
VB -> like
-NONE- -> *PRO*
NP-SBJ-1 -> PRP
NNP -> tampa
NP-SBJ -> PRP
IN -> in
SQ -> MD NP-SBJ VP
S -> NP-SBJ-1 VP
WHADVP-9 -> WRB
VP -> VB NP PP-LOC ADVP-LOC-9
VP -> VB NP INTJ
NNP -> bay
NP -> NNS
PRP -> I
S -> NP-SBJ VP
NNS -> morcillas
VP -> TO VP
MD -> will
-NONE- -> *T*
NN -> type
WRB -> where
VP -> MD S
MD -> can
INTJ -> UH
. -> ?
CC -> but
, -> ,
NP-SBJ-1 -> -NONE-
S -> S , CC S
VB -> get
NNS -> anothers
S -> SBARQ , S .
ADVP-LOC-9 -> -NONE-
VP -> VB NP
DT -> the
VP -> MD VP
TO -> to
SBARQ -> WHADVP-9 SQ
I suggest breaking extraction into extraction from a tree using a rules
attribute…
class Tree(Tree):
@property
def rules(self) -> set[Rule]:
raise NotImplementedError
…which you can use to implement ContextFreeGrammar.from_treebank
.
class ContextFreeGrammar(ContextFreeGrammar):
@classmethod
def from_treebank(cls, treebank: TreeBank) -> 'ContextFreeGrammar':
raise NotImplementedError
One thing you will need to make sure to do is to ensure that the alphabet and variables are disjoint. This is not guaranteed using naive extraction: nine symbols that show up as nonterminals also show up as terminals. Handling this will require some relabeling, which you can do using Tree.relabel
.
Another thing you should make sure to do is to correctly handle trees whose root node is not S
: a variety of “sentences” in EWT have NP
or FRAG
root nodes. But there are true sentences that have a root note other than S
: S-IMP
, SBAR
, SQ
, SINV
, SBARQ
, and S-HLN
. Assume that all and only trees with variables at the root starting with S
are sentences. Doing this will entail adding some rules to the grammar.
Finally, you should lower case all terminals. This can also be done with Tree.relabel
.
Test your Tree.rules
attribute against the tree we looked at above.
# write tests
CKY parsing
We will implement CKY parsing and recognition using a system of classes: CKYChart
s contain CKYChartEntry
s and are filled with those entries by a CKYParser
, which is a kind of ContextFreeGrammarParser
. The reasoning behind having an abstract base class for ContextFreeGrammarParser
s when we implement the EarleyParser
.
from typing import Union
= tuple[int, int]
SpanIndices = tuple[str, SpanIndices]
CKYBackPointer
class Chart(ABC):
@property
def parses(self):
raise NotImplementedError
class ChartEntry(ABC):
def __hash__(self) -> int:
raise NotImplementedError
@property
def backpointers(self):
raise NotImplementedError
class CKYChartEntry(ChartEntry):
"""
A chart entry for a CKY chart
Parameters
----------
symbol
backpointers
Attributes
----------
symbol
backpointers
"""
def __init__(self, symbol: str, *backpointers: CKYBackPointer):
self._symbol = symbol
self._backpointers = backpointers
def to_tuple(self):
return (self._symbol, self._backpointers)
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other: 'CKYChartEntry') -> bool:
return self.to_tuple() == other.to_tuple()
def __repr__(self) -> str:
return self._symbol + ' -> ' + ' '.join(
f"{bp[0]}({bp[1][0]}, {bp[1][1]})"
for bp in self.backpointers
)
def __str__(self) -> str:
return self.__repr__()
@property
def symbol(self) -> str:
return self._symbol
@property
def backpointers(self) -> tuple[CKYBackPointer, ...]:
return self._backpointers
class CKYChart(Chart):
"""
A chart for a CKY parser
Jurafsky & Martin call this a table
Parameters
----------
input_size
Attributes
----------
parses
"""
def __init__(self, input_size: int):
self._input_size = input_size
self._chart: list[list[set[CKYChartEntry]]] = [
set({})
[for _ in range(input_size+1)]
for _ in range(input_size+1)
]
def __getitem__(self, index: SpanIndices) -> set[CKYChartEntry]:
= index
i, j
self._validate_index(i, j)
return self._chart[i][j]
def __setitem__(self, key: SpanIndices, item: set[CKYChartEntry]):
= key
i, j
self._chart[i][j] = item
def _validate_index(self, i, j):
try:
assert i < j
except AssertionError:
= "cannot index into the lower " +\
msg "triangle of the chart"
raise ValueError(msg)
try:
self._chart[i]
except IndexError:
= "row index is too large"
msg raise ValueError(msg)
try:
self._chart[i][j]
except IndexError:
= "column index is too large"
msg raise ValueError(msg)
@property
def parses(self) -> set[Tree]:
try:
return self._parses
except AttributeError:
self._parses = self._construct_parses()
return self._parses
def _construct_parses(self, entry: Union['CKYChartEntry', None] = None) -> set[Tree]:
"""Construct the parses implied by the chart
Parameters
----------
entry
"""
raise NotImplementedError
class ContextFreeGrammarParser(ABC):
def __init__(self, grammar: ContextFreeGrammar):
self._grammar = grammar
def __call__(self, string, mode="recognize"):
if mode == "recognize":
return self._recognize(string)
elif mode == "parse":
return self._parse(string)
else:
= 'mode must be "parse" or "recognize"'
msg raise ValueError(msg)
class CKYParser(ContextFreeGrammarParser):
"""
A CKY parser
Parameters
----------
grammar : ContextFreeGrammar
"""
= NormalForm.CNF
normal_form
def _fill_chart(self, string: list[str]) -> CKYChart:
raise NotImplementedError
def _parse(self, string):
= self._fill_chart(string)
chart return chart.parses
def _recognize(self, string):
= self._fill_chart(string)
chart
return any([self._grammar.start_variable == entry.symbol
for entry in chart[0,len(string)]])
Task 2
Implement the CKYParser._fill_chart
and CKYChart._construct_parses
methods.
class CKYParser(CKYParser):
def _fill_chart(self, string: list[str]) -> CKYChart:
raise NotImplementedError
class CKYChart(CKYChart):
def _construct_parses(self, entry: CKYChartEntry | None = None) -> set[Tree]:
raise NotImplementedError
Test your implementation by ensuring that the following call yields the correct number of Tree
s. (We are not using the grammar you extracted from the treebank because we want to a relatively minimal example to test against. The grammar you extracted will be relevant for Task 4 below.)
= CKYParser
ContextFreeGrammar.parser_class
= ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
grammar 'with', 'a', 'fork', 'again', 'quickly'},
={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
variables={Rule('S', 'NP', 'VP'),
rules'NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
Rule(='S')
start_variable
'the', 'greyhound', 'again', 'ate', 'the',
grammar(['salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")
Dotted Rules
To implement Earley parsing, we need to augment our vanilla CFG rule from above to a dotted CFG rule, which tracks which constituents we may have seen up to a particular position in a sentence. This dotted rule needs to track not only the position of the dot in the rule (what kinds of consistuents might have been seen so far), but also which substring the dotted rule is associated with.
class DottedRule(Rule):
def __init__(self, rule: Rule, span: SpanIndices, dot: int = 0):
self._rule = rule
self._left_side = rule.left_side
self._right_side = rule.right_side
self._span = span
self._dot = dot
def to_tuple(self) -> tuple[Rule, SpanIndices, int]:
return self._rule, self._span, self._dot
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other) -> bool:
return self.to_tuple() == other.to_tuple()
def __repr__(self) -> str:
return self._left_side + ' -> ' +\
' '.join(self._right_side[:self._dot]) +\
' . ' +\
' '.join(self._right_side[self._dot:]) +\
' [' + str(self._span[0]) + ', ' + str(self._span[1]) + ']'
def complete(self, new_left_edge: int) -> tuple['DottedRule', int]:
"""Complete the next symbol in this rule
Parameters
----------
new_left_edge
Returns
-------
new_dotted_rule
completed_symbol
old_left_edge
"""
= self._dot + 1
dot = (self._span[0], new_left_edge)
span
return DottedRule(self._rule, span, dot)
@property
def next_symbol(self) -> str:
if self.is_complete:
raise AttributeError('dotted rule is already complete')
else:
return self._right_side[self._dot]
@property
def dot(self) -> int:
return self._dot
@property
def span(self) -> SpanIndices:
return self._span
@property
def is_complete(self) -> bool:
return self._dot == len(self._right_side)
@property
def left_side(self) -> str:
return self._rule.left_side
To initialize a dotted CFG rule, we pass a vanilla CFG rule along with a tuple of indices representing the span that rule has recognized. If it has recognized nothing, the left index will be equal to the right index. We don’t need to pass the dot position because we assume that, on initialization, the dot is before the first right side symbol.
= DottedRule(Rule('S', 'NP', 'VP'), (0, 0))
dotted_rule
dotted_rule
We can increment the dot by calling DottedRule.complete
with the new right edge of the span.
2) dotted_rule.complete(
This procedure creates an entirely new object.
id(dotted_rule), id(dotted_rule.complete(2))
Calling DottedRule.complete
twice will increment the dot twice.
2).complete(10) dotted_rule.complete(
Finally, dotted rules are hashable and behave how you would expect when hashed.
'S', 'NP', 'VP'), (0, 0))} {dotted_rule, DottedRule(Rule(
Earley Parsing
We will implement Earley parsing and recognition using a similar system of classes to the ones we developed for CKY: EarleyChart
s contain EarleyChartEntry
s and are filled with those entries by a EarleyParser
, which is a kind of ContextFreeGrammarParser
.
= tuple[str, int]
EarleyBackPointer
class EarleyChartEntry(ChartEntry):
"""A chart entry for a Earley chart
Parameters
----------
dotted_rule
backpointers
"""
def __init__(self, dotted_rule: DottedRule, *backpointers: EarleyBackPointer):
self._dotted_rule = dotted_rule
self._backpointers = backpointers
def to_tuple(self) -> tuple[DottedRule, tuple[EarleyBackPointer, ...]]:
return self._dotted_rule, self._backpointers
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other) -> bool:
return self.to_tuple() == other.__key()
def __repr__(self) -> str:
return self._dotted_rule.__repr__()
def __str__(self) -> str:
return self.__repr__()
@property
def backpointers(self) -> tuple[EarleyBackPointer, ...]:
return self._backpointers
@property
def dotted_rule(self):
return self._dotted_rule
@property
def next_symbol(self) -> str:
return self._dotted_rule.next_symbol
@property
def span(self) -> tuple[int]:
return self._dotted_rule.span
@property
def is_complete(self):
return self._dotted_rule.is_complete
def complete(self, entry: 'EarleyChartEntry', new_left_edge: int) -> 'EarleyChartEntry':
= self._dotted_rule.complete(new_left_edge)
new_dotted_rule
= (self._dotted_rule.next_symbol, self._dotted_rule.span[1])
bp = self._backpointers + (bp,)
backpointers
return EarleyChartEntry(new_dotted_rule, backpointers)
def is_completion_of(self, other: 'EarleyChartEntry') -> bool:
return self._dotted_rule.left_side == other.dotted_rule.next_symbol
class EarleyChart(Chart):
"""A chart for an Earley parser
Parameters
----------
input_size
"""
def __init__(self, input_size: int, start_variable: str = 'S'):
self._start_variable = start_variable
self._chart: list[set[EarleyChartEntry]] = [
set() for _ in range(input_size+1)
]
def __getitem__(self, index) -> set[EarleyChartEntry]:
return self._chart[index]
def __setitem__(self, key, item) -> None:
self._chart[key] = item
@property
def parses(self) -> set[Tree]:
try:
return self._parses
except AttributeError:
self._parses = set()
for entry in self._chart[-1]:
= entry.dotted_rule.left_side == self._start_variable
is_start = entry.dotted_rule.span == (0, self.input_size)
covers_string
if is_start and covers_string:
self._parses.add(self._construct_parses(entry))
return self._parses
def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
"""Construct the parses implied by the chart
Parameters
----------
entry
"""
raise NotImplementedError
@property
def input_size(self) -> int:
return len(self._chart) - 1
class EarleyParser(ContextFreeGrammarParser):
"""
An Earley parser
Parameters
----------
grammar : ContextFreeGrammar
"""
= None
normal_form
def _fill_chart(self, string: list[str]) -> EarleyChart:
"""
a chart for the string based on a CFG
Parameters
----------
string
"""
raise NotImplementedError
def _predict(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
for rule in self._grammar.rules(entry.next_symbol):
= (position, position)
span = DottedRule(rule, span)
dotted_rule = EarleyChartEntry(dotted_rule)
entry
chart[position].add(entry)
def _scan(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
= self._string[position]
word = self._grammar.parts_of_speech(word)
pos_for_word
if entry.next_symbol in pos_for_word:
= Rule(entry.next_symbol, word)
rule = (position, position+1)
span = DottedRule(rule, span, dot=1)
dotted_rule
= EarleyChartEntry(dotted_rule)
unary_entry
+1].add(unary_entry)
chart[position
def _complete(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
= entry.span
start, end
for prev_entry in chart[start]:
if not prev_entry.is_complete and entry.is_completion_of(prev_entry):
= prev_entry.complete(entry, end)
completed_entry
chart[position].add(completed_entry)
def _parse(self, string: str | list[str]):
= self._fill_chart(string)
chart return chart.parses
def _recognize(self, string: str | list[str]):
= self._fill_chart(string)
chart
for entry in chart[-1]:
= entry.dotted_rule.left_side == self._grammar.start_variable
is_start = entry.dotted_rule.span == (0, chart.input_size)
covers_string
if is_start and covers_string:
return True
else:
return False
= EarleyParser ContextFreeGrammar.parser_class
Task 3
Implement the EarleyParser._fill_chart
and EarleyChart._construct_parses
method. Note that EarleyChart._construct_parses
works slightly differently than CKYChart._construct_parses
in only producing a single Tree
. The parses attributes collects all Tree
s implied by a chart into a set.
class EarleyParser(EarleyParser):
def _fill_chart(self, string: list[str]) -> EarleyChart:
"""
a chart for the string based on a CFG
Parameters
----------
string
"""
raise NotImplementedError
class EarleyChart(EarleyChart):
def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
"""Construct the parses implied by the chart
Parameters
----------
entry
"""
raise NotImplementedError
Test your implementation using the same grammar and sentence we used to test the CKY implementation.
= EarleyParser
ContextFreeGrammar.parser_class
= ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
grammar 'with', 'a', 'fork', 'again', 'quickly'},
={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
variables={Rule('S', 'NP', 'VP'),
rules'NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
Rule(='S')
start_variable
'the', 'greyhound', 'again', 'ate', 'the',
grammar(['salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")
Task 4
Implement an instance method EarleyParser._predict_next_word
that is used when EarleyParser
is called in "predict"
mode. This method should take in a tokenized string—the prefix —and output a dictionary mapping from parts-of-speech that could come after the prefix to words that could come after the prefix.
from typing import Dict
from collections import defaultdict
class EarleyParser(EarleyParser):
def __call__(self, string, mode="recognize"):
if mode == "recognize":
return self._recognize(string)
elif mode == "parse":
return self._parse(string)
elif mode == "predict":
return self._predict_next_word(string)
else:
= 'mode must be "parse", "recognize", or "predict"'
msg raise ValueError(msg)
def _predict_next_word(self, prefix: list[str]) -> Dict[str, set[str]]:
raise NotImplementedError
= EarleyParser ContextFreeGrammar.parser_class
Test your prediction method against the grammar we used in class.
= ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
grammar 'with', 'a', 'fork', 'again', 'too'},
={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
variables={Rule('S', 'NP', 'VP'),
rules'NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'ate'),
Rule('VP', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'too')},
Rule(='S') start_variable
For instance, with the prefix the greyhound, you should get the following dictionary:
{'Adv': {'again', 'too'},
'VP': {'ate'},
'P': {'with'},
'V': {'ate'}}
# write tests here
Task 5
Randomly sample sentences from EWT and compute the predicted words given the first one, two, and three words in that sentence. For each such set of predictions for each sentence count the total number of parts-of-speech and the total number of words that could follow that prefix.
= ContextFreeGrammar.from_treebank(EnglishWebTreebank())
grammar
# sample and predict here
You’ll see a general pattern in the numbers, but if you dig down into exactly what the next word predictions are, they will look at. What gives rise to this oddness?