import pyparsingfrom collections.abc import Hashable, Callabletype DataType = Hashabletype TreeList = list[str, list['TreeList'] | None]type TreeTuple = tuple[DataType, tuple['TreeTuple', ...] | None]class Tree: """A tree data structure with parsing and search capabilities. Parameters ---------- data : DataType The data stored at this node. children : list[Tree] The children of this node. """ LPAR = pyparsing.Suppress('(') RPAR = pyparsing.Suppress(')') DATA = pyparsing.Regex(r'[^\(\)\s]+') PARSER = pyparsing.Forward() SUBTREE = pyparsing.ZeroOrMore(PARSER) PARSERLIST = pyparsing.Group(LPAR + DATA + SUBTREE + RPAR) PARSER <<= DATA | PARSERLIST def __init__(self, data: DataType, children: list['Tree'] = []): self._data = data self._children = children self._validate() def to_tuple(self) -> TreeTuple: """Convert this tree to a nested tuple representation. Returns ------- TreeTuple Nested tuple of (data, children_tuples). """ return self._data, tuple(c.to_tuple() for c in self._children) def __hash__(self) -> int: return hash(self.to_tuple()) def __eq__(self, other: 'Tree') -> bool: return self.to_tuple() == other.to_tuple() def __str__(self) -> str: return ' '.join(self.terminals) def __repr__(self) -> str: return self.to_string() def to_string(self, depth=0) -> str: """Render this tree as an indented string. Parameters ---------- depth : int Current depth for indentation. Returns ------- str Indented string representation of the tree. """ s = (depth - 1) * ' ' +\ int(depth > 0) * '--' +\ self._data + '\n' s += ''.join(c.to_string(depth+1) for c in self._children) return s def __contains__(self, data: DataType) -> bool: # pre-order depth-first search if self._data == data: return True else: for child in self._children: if data in child: return True return False def __getitem__(self, idx: int | tuple[int, ...]) -> 'Tree': if isinstance(idx, int): return self._children[idx] elif len(idx) == 1: return self._children[idx[0]] elif idx: return self._children[idx[0]].__getitem__(idx[1:]) else: return self @property def data(self) -> DataType: """The data stored at this node.""" return self._data @property def children(self) -> list['Tree']: """The children of this node.""" return self._children @property def terminals(self) -> list[str]: """The terminal (leaf) strings of this tree.""" if self._children: return [w for c in self._children for w in c.terminals] else: return [str(self._data)] def _validate(self) -> None: try: assert all(isinstance(c, Tree) for c in self._children) except AssertionError: msg = 'all children must be trees' raise TypeError(msg) def index(self, data: DataType, index_path: tuple[int, ...] = tuple()) -> list[tuple[int, ...]]: """Find all index paths to nodes matching the given data. Parameters ---------- data : DataType The data to search for. index_path : tuple[int, ...] The current path prefix for recursive calls. Returns ------- list[tuple[int, ...]] List of index paths to matching nodes. """ indices = [index_path] if self._data==data else [] root_path = [] if index_path == -1 else index_path indices += [j for i, c in enumerate(self._children) for j in c.index(data, root_path+(i,))] return indices def relabel(self, label_map: Callable[[DataType], DataType], nonterminals_only: bool = False, terminals_only: bool = False) -> 'Tree': """Create a copy of this tree with relabeled nodes. Parameters ---------- label_map : Callable[[DataType], DataType] Function to apply to each node's data. nonterminals_only : bool If True, only relabel nonterminal nodes. terminals_only : bool If True, only relabel terminal nodes. Returns ------- Tree A new tree with relabeled nodes. """ if not nonterminals_only and not terminals_only: data = label_map(self._data) elif nonterminals_only and self._children: data = label_map(self._data) elif terminals_only and not self._children: data = label_map(self._data) else: data = self._data children = [c.relabel(label_map, nonterminals_only, terminals_only) for c in self._children] return self.__class__(data, children) @classmethod def from_string(cls, treestr: str) -> 'Tree': """Parse a tree from a parenthesized string representation. Parameters ---------- treestr : str The parenthesized string to parse. Returns ------- Tree The parsed tree. """ treelist = cls.PARSER.parseString(treestr[2:-2])[0] return cls.from_list(treelist) @classmethod def from_list(cls, treelist: TreeList) -> 'Tree': """Build a tree from a nested list representation. Parameters ---------- treelist : TreeList The nested list to convert. Returns ------- Tree The constructed tree. """ if isinstance(treelist, str): return cls(treelist[0]) elif isinstance(treelist[1], str): return cls(treelist[0], [cls(treelist[1])]) else: return cls(treelist[0], [cls.from_list(l) for l in treelist[1:]])Assignments 9 and 10
Assignment 9 consists of Tasks 1 and 2 and Assignment 10 consists of Tasks 3-5.
In these assignments, we will focus on (i) extracting context free grammars (CFGs) from a treebank; and (ii) implement the CKY and Earley algorithms for recognizing/parsing CFGs.
Trees
We’ll use a slightly augmented version of the Tree class we developed earlier in the class to represent trees that should be output by the parser.
CFG rules
Vanilla context free grammar rules (in contrast to the dotted rules we use for Earley, define below) will be represented using a fairly lightweight wrapper around pairings of a string (the left side) with a tuple of strings (the right side).
from enum import Enumclass NormalForm(Enum): """Normal form types for context free grammars.""" CNF = 0 BNF = 1 GNF = 2class Rule: """A context free grammar rule. Parameters ---------- left_side : str The nonterminal on the left side. right_side : str Variable number of symbols on the right side. """ def __init__(self, left_side: str, *right_side: str): self._left_side = left_side self._right_side = right_side def __repr__(self) -> str: return self._left_side + ' -> ' + ' '.join(self._right_side) def to_tuple(self) -> tuple[str, tuple[str, ...]]: """Convert to a hashable tuple representation. Returns ------- tuple[str, tuple[str, ...]] The (left_side, right_side) tuple. """ return (self._left_side, self._right_side) def __hash__(self) -> int: return hash(self.to_tuple()) def __eq__(self, other: 'Rule') -> bool: left_side_equal = self._left_side == other._left_side right_side_equal = self._right_side == other._right_side return left_side_equal and right_side_equal def validate(self, alphabet: set[str], variables: set[str], normal_form: NormalForm = NormalForm.CNF): """Validate the rule against the given alphabet, variables, and normal form. Parameters ---------- alphabet : set[str] The terminal symbols. variables : set[str] The nonterminal symbols. normal_form : NormalForm The normal form to validate against. Raises ------ ValueError If the rule is invalid. """ if self._left_side not in variables: msg = "left side of rule must be a variable" raise ValueError(msg) acceptable = alphabet | variables | {''} if not all([s in acceptable for s in self._right_side]): msg = "right side of rule must contain only" +\ "a variable, a symbol, or the empty string" raise ValueError(msg) if normal_form == NormalForm.CNF: if len(self.right_side) == 1: if self.right_side[0] not in alphabet: raise ValueError(f"{self} is not in CNF") elif len(self.right_side) == 2: if not all(s in variables for s in self.right_side): raise ValueError(f"{self} is not in CNF") else: raise ValueError(f"{self} is not in CNF") @property def left_side(self) -> str: """The nonterminal on the left side.""" return self._left_side @property def right_side(self) -> tuple[str, ...]: """The symbols on the right side.""" return self._right_side @property def is_unary(self) -> bool: """Whether this rule has exactly one symbol on the right side.""" return len(self._right_side) == 1 @property def is_binary(self) -> bool: """Whether this rule has exactly two symbols on the right side.""" return len(self._right_side) == 2Defining a rule is straightforward.
Rule("S", "NP", "VP")S -> NP VP
Note that these rules are hashable, so they can be members of a python set.
{Rule("S", "NP", "VP"), Rule("S", "NP", "VP")}{S -> NP VP}
Context Free Grammar
As in the previous two assignments, we will define a ContextFreeGrammar to align very closely with the formal definition we covered in class. But because the recognition and parsing algorithms are slightly more complicated than those for finite state automata, we will factor the parsing algorithms themselves out of this class. To do this, we will specify a parser class that the grammar will initialize during its own initilization. We will set it to None to start, which will mean that we won’t be able to use the ContextFreeGrammar.__call__ method until we define a parser.
from typing import Literalfrom functools import lru_cachetype Mode = Literal["recognize", "parse"]class ContextFreeGrammar: """A context free grammar. Parameters ---------- alphabet : set[str] The terminal symbols. variables : set[str] The nonterminal symbols. rules : set[Rule] The production rules. start_variable : str The start symbol. """ # filled in by the parser class once defined parser_class = None def __init__(self, alphabet: set[str], variables: set[str], rules: set[Rule], start_variable: str): self._alphabet = alphabet self._variables = variables self._rules = rules self._start_variable = start_variable self._validate_variables() self._validate_rules() if self.parser_class is not None: self._parser = self.parser_class(self) else: self._parser = None def __call__(self, string: str | list[str], mode: Mode = "recognize"): """Parse or recognize a string. Parameters ---------- string : str | list[str] The string to parse or recognize. mode : Mode Whether to "recognize" or "parse". Returns ------- bool | set[Tree] Boolean for recognize, set of parse trees for parse. """ if self._parser is not None: return self._parser(string, mode) else: raise AttributeError("no parser is specified") def _validate_variables(self): if self._alphabet & self._variables: raise ValueError('alphabet and variables must not share elements') if self._start_variable not in self._variables: raise ValueError('start variable must be in set of variables') def _validate_rules(self): if self.parser_class is not None: for r in self._rules: r.validate(self._alphabet, self._variables, self.parser_class.normal_form) @property def alphabet(self) -> set[str]: """The terminal symbols.""" return self._alphabet @property def variables(self) -> set[str]: """The nonterminal symbols.""" return self._variables @lru_cache(2**10) def rules(self, left_side: str | None = None) -> set[Rule]: """Get rules, optionally filtered by left side. Parameters ---------- left_side : str | None If provided, return only rules with this left side. Returns ------- set[Rule] The matching rules. """ if left_side is None: return self._rules else: return {rule for rule in self._rules if rule.left_side == left_side} @property def start_variable(self) -> str: """The start symbol.""" return self._start_variable @lru_cache(2**14) def parts_of_speech(self, word: str | None = None) -> set[str]: """Get parts of speech, optionally filtered by word. Parameters ---------- word : str | None If provided, return only POS tags for this word. Returns ------- set[str] The matching parts of speech. """ if word is None: return {rule.left_side for rule in self._rules if rule.is_unary if rule.right_side[0] in self._alphabet} else: return {rule.left_side for rule in self._rules if rule.is_unary if rule.right_side[0] == word} @property def phrase_variables(self) -> set[str]: """The set of phrase-level (non-POS) variables.""" try: return self._phrase_variables except AttributeError: self._phrase_variables = {rule.left_side for rule in self._rules if not rule.is_unary or rule.right_side[0] not in self._alphabet} return self._phrase_variables @lru_cache(2^15) def reduce(self, *right_side: str) -> set[str]: """Find nonterminals that can be rewritten as the given right side. Parameters ---------- right_side : str The right side symbols to reduce. Returns ------- set[str] The set of nonterminals that can produce this right side. """ return {r.left_side for r in self._rules if r.right_side == tuple(right_side)}We can then define a context free grammar as below.
grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
'with', 'a', 'fork', 'again', 'quickly'},
variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
rules={Rule('S', 'NP', 'VP'),
Rule('NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
start_variable='S')There are a few attributes and methods to take note of here. One allows you to easily work with parts of speech…
grammar.parts_of_speech(){'Adv', 'D', 'N', 'P', 'V'}
…including finding all the parts of speech for a particular word.
grammar.parts_of_speech("fork"){'N', 'V'}
Another allows easy access to rules…
grammar.rules(){Adv -> again,
Adv -> quickly,
D -> a,
D -> the,
N -> fork,
N -> greyhound,
N -> salmon,
NP -> D N,
NP -> NP PP,
P -> with,
PP -> P NP,
S -> NP VP,
V -> ate,
V -> fork,
VP -> Adv VP,
VP -> V NP,
VP -> VP Adv,
VP -> VP PP}
…including finding all rules that have a particular left side.
grammar.rules("VP"){VP -> Adv VP, VP -> V NP, VP -> VP Adv, VP -> VP PP}
Most importantly, I have implemented a ContextFreeGrammar.reduce method for finding all left sides that can be rewritten from a particular right side.
grammar.reduce("V", "NP"){'VP'}
Treebank
We will extract a CFG from the English Web Treebank. We can use the reader developed earlier in the course to read the treebank.
import tarfile
from abc import ABC
class TreeBank(ABC):
def __iter__(self):
return self
def __next__(self):
return next(self._tree_iter)
class EnglishWebTreebank(TreeBank):
def __init__(self, root='LDC2012T13.tgz'):
self._root = root
self._tree_iter = self._construct_tree_iter()
def _construct_tree_iter(self):
with tarfile.open(self._root) as corpus:
for fname in corpus.getnames():
if '.xml.tree' in fname:
with corpus.extractfile(fname) as treefile:
treestr = treefile.readline().decode()
yield fname, Tree.from_string(treestr)Task 1
Write a class method ContextFreeGrammar.from_treebank that extracts all of the alphabet elements, variables, and rules implied by the trees in the treebank. For instance, the rules implied by the following tree…
print(next(EnglishWebTreebank())[1].__repr__())…are:
VB -> try
PP-LOC -> IN NP
NP -> NNP NNP
JJ -> argentinian
UH -> please
NP -> DT JJ NN
VB -> like
-NONE- -> *PRO*
NP-SBJ-1 -> PRP
NNP -> tampa
NP-SBJ -> PRP
IN -> in
SQ -> MD NP-SBJ VP
S -> NP-SBJ-1 VP
WHADVP-9 -> WRB
VP -> VB NP PP-LOC ADVP-LOC-9
VP -> VB NP INTJ
NNP -> bay
NP -> NNS
PRP -> I
S -> NP-SBJ VP
NNS -> morcillas
VP -> TO VP
MD -> will
-NONE- -> *T*
NN -> type
WRB -> where
VP -> MD S
MD -> can
INTJ -> UH
. -> ?
CC -> but
, -> ,
NP-SBJ-1 -> -NONE-
S -> S , CC S
VB -> get
NNS -> anothers
S -> SBARQ , S .
ADVP-LOC-9 -> -NONE-
VP -> VB NP
DT -> the
VP -> MD VP
TO -> to
SBARQ -> WHADVP-9 SQ
I suggest breaking extraction into extraction from a tree using a rules attribute…
class Tree(Tree):
@property
def rules(self) -> set[Rule]:
raise NotImplementedError …which you can use to implement ContextFreeGrammar.from_treebank.
class ContextFreeGrammar(ContextFreeGrammar):
@classmethod
def from_treebank(cls, treebank: TreeBank) -> 'ContextFreeGrammar':
raise NotImplementedErrorOne thing you will need to make sure to do is to ensure that the alphabet and variables are disjoint. This is not guaranteed using naive extraction: nine symbols that show up as nonterminals also show up as terminals. Handling this will require some relabeling, which you can do using Tree.relabel.
Another thing you should make sure to do is to correctly handle trees whose root node is not S: a variety of “sentences” in EWT have NP or FRAG root nodes. But there are true sentences that have a root note other than S: S-IMP, SBAR, SQ, SINV, SBARQ, and S-HLN. Assume that all and only trees with variables at the root starting with S are sentences. Doing this will entail adding some rules to the grammar.
Finally, you should lower case all terminals. This can also be done with Tree.relabel.
Test your Tree.rules attribute against the tree we looked at above.
# write testsCKY parsing
We will implement CKY parsing and recognition using a system of classes: CKYCharts contain CKYChartEntrys and are filled with those entries by a CKYParser, which is a kind of ContextFreeGrammarParser. The reasoning behind having an abstract base class for ContextFreeGrammarParsers when we implement the EarleyParser.
type SpanIndices = tuple[int, int]type CKYBackPointer = tuple[str, SpanIndices]class Chart(ABC): """Abstract base class for parser charts.""" @property def parses(self): """The parses implied by this chart.""" raise NotImplementedErrorclass ChartEntry(ABC): """Abstract base class for chart entries.""" def __hash__(self) -> int: raise NotImplementedError @property def backpointers(self): """The backpointers for this entry.""" raise NotImplementedErrorclass CKYChartEntry(ChartEntry): """A chart entry for a CKY chart. Parameters ---------- symbol : str The nonterminal symbol. backpointers : CKYBackPointer Variable number of backpointer tuples. """ def __init__(self, symbol: str, *backpointers: CKYBackPointer): self._symbol = symbol self._backpointers = backpointers def to_tuple(self): """Convert to a hashable tuple representation. Returns ------- tuple[str, tuple[CKYBackPointer, ...]] The tuple representation. """ return (self._symbol, self._backpointers) def __hash__(self) -> int: return hash(self.to_tuple()) def __eq__(self, other: 'CKYChartEntry') -> bool: return self.to_tuple() == other.to_tuple() def __repr__(self) -> str: return self._symbol + ' -> ' + ' '.join( f"{bp[0]}({bp[1][0]}, {bp[1][1]})" for bp in self.backpointers ) def __str__(self) -> str: return self.__repr__() @property def symbol(self) -> str: """The nonterminal symbol.""" return self._symbol @property def backpointers(self) -> tuple[CKYBackPointer, ...]: """The backpointers for this entry.""" return self._backpointersclass CKYChart(Chart): """A chart for a CKY parser. Jurafsky & Martin call this a table. Parameters ---------- input_size : int The length of the input string. """ def __init__(self, input_size: int): self._input_size = input_size self._chart: list[list[set[CKYChartEntry]]] = [ [set({}) for _ in range(input_size+1)] for _ in range(input_size+1) ] def __getitem__(self, index: SpanIndices) -> set[CKYChartEntry]: i, j = index self._validate_index(i, j) return self._chart[i][j] def __setitem__(self, key: SpanIndices, item: set[CKYChartEntry]): i, j = key self._chart[i][j] = item def _validate_index(self, i, j): if i >= j: msg = "cannot index into the lower " +\ "triangle of the chart" raise ValueError(msg) try: self._chart[i] except IndexError: msg = "row index is too large" raise ValueError(msg) try: self._chart[i][j] except IndexError: msg = "column index is too large" raise ValueError(msg) @property def parses(self) -> set[Tree]: """The parse trees implied by the chart. Returns ------- set[Tree] The set of parse trees. """ try: return self._parses except AttributeError: self._parses = self._construct_parses() return self._parses def _construct_parses(self, entry: 'CKYChartEntry | None' = None) -> set[Tree]: """Construct the parses implied by the chart. Parameters ---------- entry : CKYChartEntry | None The chart entry to construct parses from, or None for the top-level entry. Returns ------- set[Tree] The set of parse trees. """ raise NotImplementedError class ContextFreeGrammarParser(ABC): """Abstract base class for CFG parsers. Parameters ---------- grammar : ContextFreeGrammar The grammar to parse with. """ def __init__(self, grammar: ContextFreeGrammar): self._grammar = grammar def __call__(self, string, mode="recognize"): """Parse or recognize a string. Parameters ---------- string : str | list[str] The string to process. mode : str Whether to "recognize" or "parse". Returns ------- bool | set[Tree] Boolean for recognize, set of parse trees for parse. """ if mode == "recognize": return self._recognize(string) elif mode == "parse": return self._parse(string) else: msg = 'mode must be "parse" or "recognize"' raise ValueError(msg)class CKYParser(ContextFreeGrammarParser): """A CKY parser. Parameters ---------- grammar : ContextFreeGrammar The grammar to parse with; must be in CNF. """ normal_form = NormalForm.CNF def _fill_chart(self, string: list[str]) -> CKYChart: """Fill the CKY chart for the given string. Parameters ---------- string : list[str] The input string to parse. Returns ------- CKYChart The filled chart. """ raise NotImplementedError def _parse(self, string): chart = self._fill_chart(string) return chart.parses def _recognize(self, string): chart = self._fill_chart(string) return any([self._grammar.start_variable == entry.symbol for entry in chart[0,len(string)]])Task 2
Implement the CKYParser._fill_chart and CKYChart._construct_parses methods.
class CKYParser(CKYParser):
def _fill_chart(self, string: list[str]) -> CKYChart:
raise NotImplementedError
class CKYChart(CKYChart):
def _construct_parses(self, entry: CKYChartEntry | None = None) -> set[Tree]:
raise NotImplementedErrorTest your implementation by ensuring that the following call yields the correct number of Trees. (We are not using the grammar you extracted from the treebank because we want to a relatively minimal example to test against. The grammar you extracted will be relevant for Task 4 below.)
ContextFreeGrammar.parser_class = CKYParser
grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
'with', 'a', 'fork', 'again', 'quickly'},
variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
rules={Rule('S', 'NP', 'VP'),
Rule('NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
start_variable='S')
grammar(['the', 'greyhound', 'again', 'ate', 'the',
'salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")Dotted Rules
To implement Earley parsing, we need to augment our vanilla CFG rule from above to a dotted CFG rule, which tracks which constituents we may have seen up to a particular position in a sentence. This dotted rule needs to track not only the position of the dot in the rule (what kinds of consistuents might have been seen so far), but also which substring the dotted rule is associated with.
class DottedRule(Rule):
def __init__(self, rule: Rule, span: SpanIndices, dot: int = 0):
self._rule = rule
self._left_side = rule.left_side
self._right_side = rule.right_side
self._span = span
self._dot = dot
def to_tuple(self) -> tuple[Rule, SpanIndices, int]:
return self._rule, self._span, self._dot
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other) -> bool:
return self.to_tuple() == other.to_tuple()
def __repr__(self) -> str:
return self._left_side + ' -> ' +\
' '.join(self._right_side[:self._dot]) +\
' . ' +\
' '.join(self._right_side[self._dot:]) +\
' [' + str(self._span[0]) + ', ' + str(self._span[1]) + ']'
def complete(self, new_left_edge: int) -> tuple['DottedRule', int]:
"""Complete the next symbol in this rule
Parameters
----------
new_left_edge
Returns
-------
new_dotted_rule
completed_symbol
old_left_edge
"""
dot = self._dot + 1
span = (self._span[0], new_left_edge)
return DottedRule(self._rule, span, dot)
@property
def next_symbol(self) -> str:
if self.is_complete:
raise AttributeError('dotted rule is already complete')
else:
return self._right_side[self._dot]
@property
def dot(self) -> int:
return self._dot
@property
def span(self) -> SpanIndices:
return self._span
@property
def is_complete(self) -> bool:
return self._dot == len(self._right_side)
@property
def left_side(self) -> str:
return self._rule.left_sideTo initialize a dotted CFG rule, we pass a vanilla CFG rule along with a tuple of indices representing the span that rule has recognized. If it has recognized nothing, the left index will be equal to the right index. We don’t need to pass the dot position because we assume that, on initialization, the dot is before the first right side symbol.
dotted_rule = DottedRule(Rule('S', 'NP', 'VP'), (0, 0))
dotted_ruleWe can increment the dot by calling DottedRule.complete with the new right edge of the span.
dotted_rule.complete(2)This procedure creates an entirely new object.
id(dotted_rule), id(dotted_rule.complete(2))Calling DottedRule.complete twice will increment the dot twice.
dotted_rule.complete(2).complete(10)Finally, dotted rules are hashable and behave how you would expect when hashed.
{dotted_rule, DottedRule(Rule('S', 'NP', 'VP'), (0, 0))}Earley Parsing
We will implement Earley parsing and recognition using a similar system of classes to the ones we developed for CKY: EarleyCharts contain EarleyChartEntrys and are filled with those entries by a EarleyParser, which is a kind of ContextFreeGrammarParser.
EarleyBackPointer = tuple[str, int]
class EarleyChartEntry(ChartEntry):
"""A chart entry for a Earley chart
Parameters
----------
dotted_rule
backpointers
"""
def __init__(self, dotted_rule: DottedRule, *backpointers: EarleyBackPointer):
self._dotted_rule = dotted_rule
self._backpointers = backpointers
def to_tuple(self) -> tuple[DottedRule, tuple[EarleyBackPointer, ...]]:
return self._dotted_rule, self._backpointers
def __hash__(self) -> int:
return hash(self.to_tuple())
def __eq__(self, other) -> bool:
return self.to_tuple() == other.__key()
def __repr__(self) -> str:
return self._dotted_rule.__repr__()
def __str__(self) -> str:
return self.__repr__()
@property
def backpointers(self) -> tuple[EarleyBackPointer, ...]:
return self._backpointers
@property
def dotted_rule(self):
return self._dotted_rule
@property
def next_symbol(self) -> str:
return self._dotted_rule.next_symbol
@property
def span(self) -> tuple[int]:
return self._dotted_rule.span
@property
def is_complete(self):
return self._dotted_rule.is_complete
def complete(self, entry: 'EarleyChartEntry', new_left_edge: int) -> 'EarleyChartEntry':
new_dotted_rule = self._dotted_rule.complete(new_left_edge)
bp = (self._dotted_rule.next_symbol, self._dotted_rule.span[1])
backpointers = self._backpointers + (bp,)
return EarleyChartEntry(new_dotted_rule, backpointers)
def is_completion_of(self, other: 'EarleyChartEntry') -> bool:
return self._dotted_rule.left_side == other.dotted_rule.next_symbol
class EarleyChart(Chart):
"""A chart for an Earley parser
Parameters
----------
input_size
"""
def __init__(self, input_size: int, start_variable: str = 'S'):
self._start_variable = start_variable
self._chart: list[set[EarleyChartEntry]] = [
set() for _ in range(input_size+1)
]
def __getitem__(self, index) -> set[EarleyChartEntry]:
return self._chart[index]
def __setitem__(self, key, item) -> None:
self._chart[key] = item
@property
def parses(self) -> set[Tree]:
try:
return self._parses
except AttributeError:
self._parses = set()
for entry in self._chart[-1]:
is_start = entry.dotted_rule.left_side == self._start_variable
covers_string = entry.dotted_rule.span == (0, self.input_size)
if is_start and covers_string:
self._parses.add(self._construct_parses(entry))
return self._parses
def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
"""Construct the parses implied by the chart
Parameters
----------
entry
"""
raise NotImplementedError
@property
def input_size(self) -> int:
return len(self._chart) - 1
class EarleyParser(ContextFreeGrammarParser):
"""
An Earley parser
Parameters
----------
grammar : ContextFreeGrammar
"""
normal_form = None
def _fill_chart(self, string: list[str]) -> EarleyChart:
"""
a chart for the string based on a CFG
Parameters
----------
string
"""
raise NotImplementedError
def _predict(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
for rule in self._grammar.rules(entry.next_symbol):
span = (position, position)
dotted_rule = DottedRule(rule, span)
entry = EarleyChartEntry(dotted_rule)
chart[position].add(entry)
def _scan(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
word = self._string[position]
pos_for_word = self._grammar.parts_of_speech(word)
if entry.next_symbol in pos_for_word:
rule = Rule(entry.next_symbol, word)
span = (position, position+1)
dotted_rule = DottedRule(rule, span, dot=1)
unary_entry = EarleyChartEntry(dotted_rule)
chart[position+1].add(unary_entry)
def _complete(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
start, end = entry.span
for prev_entry in chart[start]:
if not prev_entry.is_complete and entry.is_completion_of(prev_entry):
completed_entry = prev_entry.complete(entry, end)
chart[position].add(completed_entry)
def _parse(self, string: str | list[str]):
chart = self._fill_chart(string)
return chart.parses
def _recognize(self, string: str | list[str]):
chart = self._fill_chart(string)
for entry in chart[-1]:
is_start = entry.dotted_rule.left_side == self._grammar.start_variable
covers_string = entry.dotted_rule.span == (0, chart.input_size)
if is_start and covers_string:
return True
else:
return False
ContextFreeGrammar.parser_class = EarleyParserTask 3
Implement the EarleyParser._fill_chart and EarleyChart._construct_parses method. Note that EarleyChart._construct_parses works slightly differently than CKYChart._construct_parses in only producing a single Tree. The parses attributes collects all Trees implied by a chart into a set.
class EarleyParser(EarleyParser):
def _fill_chart(self, string: list[str]) -> EarleyChart:
"""
a chart for the string based on a CFG
Parameters
----------
string
"""
raise NotImplementedError
class EarleyChart(EarleyChart):
def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
"""Construct the parses implied by the chart
Parameters
----------
entry
"""
raise NotImplementedErrorTest your implementation using the same grammar and sentence we used to test the CKY implementation.
ContextFreeGrammar.parser_class = EarleyParser
grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
'with', 'a', 'fork', 'again', 'quickly'},
variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
rules={Rule('S', 'NP', 'VP'),
Rule('NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'fork'),
Rule('V', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'quickly')},
start_variable='S')
grammar(['the', 'greyhound', 'again', 'ate', 'the',
'salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")Task 4
Implement an instance method EarleyParser._predict_next_word that is used when EarleyParser is called in "predict" mode. This method should take in a tokenized string—the prefix —and output a dictionary mapping from parts-of-speech that could come after the prefix to words that could come after the prefix.
from collections import defaultdictclass EarleyParser(EarleyParser): """Extended Earley parser with next-word prediction. Parameters ---------- grammar : ContextFreeGrammar The grammar to parse with. """ def __call__(self, string, mode="recognize"): """Parse, recognize, or predict next words. Parameters ---------- string : str | list[str] The string to process. mode : str One of "recognize", "parse", or "predict". Returns ------- bool | set[Tree] | dict[str, set[str]] Depends on mode. """ if mode == "recognize": return self._recognize(string) elif mode == "parse": return self._parse(string) elif mode == "predict": return self._predict_next_word(string) else: msg = 'mode must be "parse", "recognize", or "predict"' raise ValueError(msg) def _predict_next_word(self, prefix: list[str]) -> dict[str, set[str]]: """Predict the next word given a prefix. Parameters ---------- prefix : list[str] The prefix string to predict from. Returns ------- dict[str, set[str]] Mapping from predicted words to the set of POS tags that could produce them. """ raise NotImplementedErrorContextFreeGrammar.parser_class = EarleyParserTest your prediction method against the grammar we used in class.
grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
'with', 'a', 'fork', 'again', 'too'},
variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
rules={Rule('S', 'NP', 'VP'),
Rule('NP', 'D', 'N'),
Rule('NP', 'NP', 'PP'),
Rule('VP', 'V', 'NP'),
Rule('VP', 'VP', 'PP'),
Rule('VP', 'Adv', 'VP'),
Rule('VP', 'VP', 'Adv'),
Rule('PP', 'P', 'NP'),
Rule('D', 'the'),
Rule('D', 'a'),
Rule('N', 'greyhound'),
Rule('N', 'salmon'),
Rule('N', 'fork'),
Rule('V', 'ate'),
Rule('VP', 'ate'),
Rule('P', 'with'),
Rule('Adv', 'again'),
Rule('Adv', 'too')},
start_variable='S')For instance, with the prefix the greyhound, you should get the following dictionary:
{'Adv': {'again', 'too'},
'VP': {'ate'},
'P': {'with'},
'V': {'ate'}}
# write tests hereTask 5
Randomly sample sentences from EWT and compute the predicted words given the first one, two, and three words in that sentence. For each such set of predictions for each sentence count the total number of parts-of-speech and the total number of words that could follow that prefix.
grammar = ContextFreeGrammar.from_treebank(EnglishWebTreebank())
# sample and predict hereYou’ll see a general pattern in the numbers, but if you dig down into exactly what the next word predictions are, they will look at. What gives rise to this oddness?