Assignments 9 and 10

Assignment 9 consists of Tasks 1 and 2 and Assignment 10 consists of Tasks 3-5.

In these assignments, we will focus on (i) extracting context free grammars (CFGs) from a treebank; and (ii) implement the CKY and Earley algorithms for recognizing/parsing CFGs.

Trees

We’ll use a slightly augmented version of the Tree class we developed earlier in the class to represent trees that should be output by the parser.

import pyparsing

from typing import TypeVar, Optional
from collections.abc import Hashable, Callable

DataType = Hashable
TreeList = list[str, Optional[list['TreeList']]]
TreeTuple = tuple[DataType, Optional[tuple['TreeTuple', ...]]]

class Tree:

    LPAR = pyparsing.Suppress('(')
    RPAR = pyparsing.Suppress(')')
    DATA = pyparsing.Regex(r'[^\(\)\s]+')

    PARSER = pyparsing.Forward()
    SUBTREE = pyparsing.ZeroOrMore(PARSER)
    PARSERLIST = pyparsing.Group(LPAR + DATA + SUBTREE + RPAR)
    PARSER <<= DATA | PARSERLIST
    
    def __init__(self, data: DataType, children: list['Tree'] = []):
        self._data = data
        self._children = children
        
        self._validate()
  
    def to_tuple(self) -> TreeTuple:
        return self._data, tuple(c.to_tuple() for c in self._children)

    def __hash__(self) -> int:
        return hash(self.to_tuple())
    
    def __eq__(self, other: 'Tree') -> bool:
        return self.to_tuple() == other.to_tuple()

    def __str__(self) -> str:
        return ' '.join(self.terminals)
        
    def __repr__(self) -> str:
        return self.to_string()
     
    def to_string(self, depth=0) -> str:
        s = (depth - 1) * '  ' +\
            int(depth > 0) * '--' +\
            self._data + '\n'
        s += ''.join(c.to_string(depth+1)
                     for c in self._children)
        
        return s
    
    def __contains__(self, data: DataType) -> bool:
        # pre-order depth-first search
        if self._data == data:
            return True
        else:
            for child in self._children:
                if data in child:
                    return True
                
            return False
        
    def __getitem__(self, idx: int | tuple[int, ...]) -> 'Tree':
        if isinstance(idx, int):
            return self._children[idx]
        elif len(idx) == 1:
            return self._children[idx[0]]
        elif idx:
            return self._children[idx[0]].__getitem__(idx[1:])
        else:
            return self
        
    @property
    def data(self) -> DataType:
        return self._data 
    
    @property
    def children(self) -> list['Tree']:
        return self._children
     
    @property
    def terminals(self) -> list[str]:
        if self._children:
            return [w for c in self._children 
                    for w in c.terminals]
        else:
            return [str(self._data)]
        
    def _validate(self) -> None:
        try:
            assert all(isinstance(c, Tree)
                       for c in self._children)
        except AssertionError:
            msg = 'all children must be trees'
            raise TypeError(msg)
            
    def index(self, data: DataType, index_path: tuple[int, ...] = tuple()) -> list[tuple[int, ...]]:
        indices = [index_path] if self._data==data else []
        root_path = [] if index_path == -1 else index_path
        
        indices += [j 
                    for i, c in enumerate(self._children) 
                    for j in c.index(data, root_path+(i,))]

        return indices
    
    def relabel(self, label_map: Callable[[DataType], DataType], 
                nonterminals_only: bool = False, terminals_only: bool = False) -> 'Tree':
        if not nonterminals_only and not terminals_only:
            data = label_map(self._data)
        elif nonterminals_only and self._children:
            data = label_map(self._data)
        elif terminals_only and not self._children:
            data = label_map(self._data)
        else:
            data = self._data
        
        children = [c.relabel(label_map, nonterminals_only, terminals_only) 
                    for c in self._children]
        
        return self.__class__(data, children)
    
    @classmethod
    def from_string(cls, treestr: str) -> 'Tree':
        treelist = cls.PARSER.parseString(treestr[2:-2])[0]
        
        return cls.from_list(treelist)
    
    @classmethod
    def from_list(cls, treelist: TreeList) -> 'Tree':
        if isinstance(treelist, str):
            return cls(treelist[0])
        elif isinstance(treelist[1], str):
            return cls(treelist[0], [cls(treelist[1])])
        else:
            return cls(treelist[0], [cls.from_list(l) for l in treelist[1:]])

CFG rules

Vanilla context free grammar rules (in contrast to the dotted rules we use for Earley, define below) will be represented using a fairly lightweight wrapper around pairings of a string (the left side) with a tuple of strings (the right side).

from enum import Enum

class NormalForm(Enum):
    CNF = 0
    BNF = 1
    GNF = 2

class Rule:
    """A context free grammar rule

    Parameters
    ----------
    left_side
    right_side
    """

    def __init__(self, left_side: str, *right_side: str):
        self._left_side = left_side
        self._right_side = right_side

    def __repr__(self) -> str:
        return self._left_side + ' -> ' + ' '.join(self._right_side)
        
    def to_tuple(self) -> tuple[str, tuple[str, ...]]:
        return (self._left_side, self._right_side)
        
    def __hash__(self) -> int:
        return hash(self.to_tuple())

    def __eq__(self, other: 'Rule') -> bool:
        left_side_equal = self._left_side == other._left_side
        right_side_equal = self._right_side == other._right_side

        return left_side_equal and right_side_equal

    def validate(self, alphabet: set[str], variables: set[str], normal_form: NormalForm = NormalForm.CNF):
        """validate the rule

        Parameters
        ----------
        alphabet : set(str)
        variables : set(str)
        """

        if self._left_side not in variables:
            msg = "left side of rule must be a variable"
            raise ValueError(msg)

        acceptable = alphabet | variables | {''}
            
        if not all([s in acceptable for s in self._right_side]):
            msg = "right side of rule must contain only" +\
                  "a variable, a symbol, or the empty string"
            raise ValueError(msg)

        if normal_form == NormalForm.CNF:
            try:
                if len(self.right_side) == 1:
                    assert self.right_side[0] in alphabet
                elif len(self.right_side) == 2:
                    assert all([s in variables for s in self.right_side])
                else:
                    raise AssertionError

            except AssertionError:
                raise ValueError(f"{self} is not in CNF")
        

    @property
    def left_side(self) -> str:
        return self._left_side

    @property
    def right_side(self) -> tuple[str, ...]:
        return self._right_side

    @property
    def is_unary(self) -> bool:
        return len(self._right_side) == 1
    
    @property
    def is_binary(self) -> bool:
        return len(self._right_side) == 2

Defining a rule is straightforward.

Rule("S", "NP", "VP")
S -> NP VP

Note that these rules are hashable, so they can be members of a python set.

{Rule("S", "NP", "VP"), Rule("S", "NP", "VP")}
{S -> NP VP}

Context Free Grammar

As in the previous two assignments, we will define a ContextFreeGrammar to align very closely with the formal definition we covered in class. But because the recognition and parsing algorithms are slightly more complicated than those for finite state automata, we will factor the parsing algorithms themselves out of this class. To do this, we will specify a parser class that the grammar will initialize during its own initilization. We will set it to None to start, which will mean that we won’t be able to use the ContextFreeGrammar.__call__ method until we define a parser.

from typing import Literal
from functools import lru_cache

Mode = Literal["recognize", "parse"]

class ContextFreeGrammar:

    """
    A context free grammar

    Parameters
    ----------
    alphabet : set(str)
    variables : set(str)
    rules : set(Rule)
    start_variable : str

    Attributes
    ----------
    alphabet : set(str)
    variables : set(str)
    rules : set(Rule)
    start_variable : str

    Methods
    -------
    reduce(left_side)
    """
    
    # this will need to be filled in by the parser class, once it is defined:
    # - CKYParser
    # - EarleyParser
    parser_class = None
    
    def __init__(self, alphabet: set[str], variables: set[str], rules: set[Rule], start_variable: str):
        self._alphabet = alphabet
        self._variables = variables
        self._rules = rules
        self._start_variable = start_variable
        
        self._validate_variables()
        self._validate_rules()

        if self.parser_class is not None:
            self._parser = self.parser_class(self)
        else:
            self._parser = None

    def __call__(self, string: str | list[str], mode: Mode = "recognize"):
        if self._parser is not None:
            return self._parser(string, mode)
        else:
            raise AttributeError("no parser is specified")
        
    def _validate_variables(self):
        if self._alphabet & self._variables:
            raise ValueError('alphabet and variables must not share elements')
        
        if self._start_variable not in self._variables:
            raise ValueError('start variable must be in set of variables')

    def _validate_rules(self):
        if self.parser_class is not None:
            for r in self._rules:
                r.validate(self._alphabet, self._variables,
                           self.parser_class.normal_form)

    @property            
    def alphabet(self) -> set[str]:
        return self._alphabet

    @property    
    def variables(self) -> set[str]:
        return self._variables
   
    @lru_cache(2**10)
    def rules(self, left_side: str | None = None) -> set[Rule]:
        if left_side is None:
            return self._rules
        else:
            return {rule for rule in self._rules 
                    if rule.left_side == left_side}

    @property
    def start_variable(self) -> str:
        return self._start_variable

    @lru_cache(2**14)
    def parts_of_speech(self, word: str | None = None) -> set[str]:
        if word is None:
            return {rule.left_side for rule in self._rules 
                    if rule.is_unary 
                    if rule.right_side[0] in self._alphabet}
        
        else:
            return {rule.left_side for rule in self._rules 
                    if rule.is_unary 
                    if rule.right_side[0] == word}
  
    @property
    def phrase_variables(self) -> set[str]:
        try:
            return self._phrase_variables
        except AttributeError:
            self._phrase_variables = {rule.left_side for rule in self._rules 
                                      if not rule.is_unary or 
                                      rule.right_side[0] not in self._alphabet}
            return self._phrase_variables

    @lru_cache(2^15)
    def reduce(self, *right_side: str) -> set[str]:
        """
        the nonterminals that can be rewritten as right_side

        Parameters
        ----------
        right_side

        Returns
        -------
        set(str)
        """
        return {r.left_side for r in self._rules
                if r.right_side == tuple(right_side)}

We can then define a context free grammar as below.

grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
                                       'with', 'a', 'fork', 'again', 'quickly'},
                             variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
                             rules={Rule('S', 'NP', 'VP'),
                                    Rule('NP', 'D', 'N'),
                                    Rule('NP', 'NP', 'PP'),
                                    Rule('VP', 'V', 'NP'),
                                    Rule('VP', 'VP', 'PP'),
                                    Rule('VP', 'Adv', 'VP'),
                                    Rule('VP', 'VP', 'Adv'),
                                    Rule('PP', 'P', 'NP'),
                                    Rule('D', 'the'),
                                    Rule('D', 'a'),
                                    Rule('N', 'greyhound'),
                                    Rule('N', 'salmon'),
                                    Rule('N', 'fork'),
                                    Rule('V', 'fork'),
                                    Rule('V', 'ate'),
                                    Rule('P', 'with'),
                                    Rule('Adv', 'again'),
                                    Rule('Adv', 'quickly')},
                             start_variable='S')

There are a few attributes and methods to take note of here. One allows you to easily work with parts of speech…

grammar.parts_of_speech()
{'Adv', 'D', 'N', 'P', 'V'}

…including finding all the parts of speech for a particular word.

grammar.parts_of_speech("fork")
{'N', 'V'}

Another allows easy access to rules…

grammar.rules()
{Adv -> again,
 Adv -> quickly,
 D -> a,
 D -> the,
 N -> fork,
 N -> greyhound,
 N -> salmon,
 NP -> D N,
 NP -> NP PP,
 P -> with,
 PP -> P NP,
 S -> NP VP,
 V -> ate,
 V -> fork,
 VP -> Adv VP,
 VP -> V NP,
 VP -> VP Adv,
 VP -> VP PP}

…including finding all rules that have a particular left side.

grammar.rules("VP")
{VP -> Adv VP, VP -> V NP, VP -> VP Adv, VP -> VP PP}

Most importantly, I have implemented a ContextFreeGrammar.reduce method for finding all left sides that can be rewritten from a particular right side.

grammar.reduce("V", "NP")
{'VP'}

Treebank

We will extract a CFG from the English Web Treebank. We can use the reader developed earlier in the course to read the treebank.

import tarfile
from abc import ABC

class TreeBank(ABC):
    
    def __iter__(self):
        return self
    
    def __next__(self):
        return next(self._tree_iter)
        
class EnglishWebTreebank(TreeBank):
    
    def __init__(self, root='LDC2012T13.tgz'):
        self._root = root

        self._tree_iter = self._construct_tree_iter()
        
    def _construct_tree_iter(self):
        with tarfile.open(self._root) as corpus:
            for fname in corpus.getnames():
                if '.xml.tree' in fname:
                    with corpus.extractfile(fname) as treefile:
                        treestr = treefile.readline().decode()
                        yield fname, Tree.from_string(treestr)

Task 1

Write a class method ContextFreeGrammar.from_treebank that extracts all of the alphabet elements, variables, and rules implied by the trees in the treebank. For instance, the rules implied by the following tree…

print(next(EnglishWebTreebank())[1].__repr__())

…are:

VB -> try
PP-LOC -> IN NP
NP -> NNP NNP
JJ -> argentinian
UH -> please
NP -> DT JJ NN
VB -> like
-NONE- -> *PRO*
NP-SBJ-1 -> PRP
NNP -> tampa
NP-SBJ -> PRP
IN -> in
SQ -> MD NP-SBJ VP
S -> NP-SBJ-1 VP
WHADVP-9 -> WRB
VP -> VB NP PP-LOC ADVP-LOC-9
VP -> VB NP INTJ
NNP -> bay
NP -> NNS
PRP -> I
S -> NP-SBJ VP
NNS -> morcillas
VP -> TO VP
MD -> will
-NONE- -> *T*
NN -> type
WRB -> where
VP -> MD S
MD -> can
INTJ -> UH
. -> ?
CC -> but
, -> ,
NP-SBJ-1 -> -NONE-
S -> S , CC S
VB -> get
NNS -> anothers
S -> SBARQ , S .
ADVP-LOC-9 -> -NONE-
VP -> VB NP
DT -> the
VP -> MD VP
TO -> to
SBARQ -> WHADVP-9 SQ

I suggest breaking extraction into extraction from a tree using a rules attribute…

class Tree(Tree):
    
    @property
    def rules(self) -> set[Rule]:
        raise NotImplementedError 

…which you can use to implement ContextFreeGrammar.from_treebank.

class ContextFreeGrammar(ContextFreeGrammar):
    
    @classmethod
    def from_treebank(cls, treebank: TreeBank) -> 'ContextFreeGrammar':
        raise NotImplementedError

One thing you will need to make sure to do is to ensure that the alphabet and variables are disjoint. This is not guaranteed using naive extraction: nine symbols that show up as nonterminals also show up as terminals. Handling this will require some relabeling, which you can do using Tree.relabel.

Another thing you should make sure to do is to correctly handle trees whose root node is not S: a variety of “sentences” in EWT have NP or FRAG root nodes. But there are true sentences that have a root note other than S: S-IMP, SBAR, SQ, SINV, SBARQ, and S-HLN. Assume that all and only trees with variables at the root starting with S are sentences. Doing this will entail adding some rules to the grammar.

Finally, you should lower case all terminals. This can also be done with Tree.relabel.

Test your Tree.rules attribute against the tree we looked at above.

# write tests

CKY parsing

We will implement CKY parsing and recognition using a system of classes: CKYCharts contain CKYChartEntrys and are filled with those entries by a CKYParser, which is a kind of ContextFreeGrammarParser. The reasoning behind having an abstract base class for ContextFreeGrammarParsers when we implement the EarleyParser.

from typing import Union

SpanIndices = tuple[int, int]
CKYBackPointer = tuple[str, SpanIndices]

class Chart(ABC):

    @property
    def parses(self):
        raise NotImplementedError

class ChartEntry(ABC):

    def __hash__(self) -> int:
        raise NotImplementedError

    @property
    def backpointers(self):
        raise NotImplementedError

class CKYChartEntry(ChartEntry):
    """
    A chart entry for a CKY chart

    Parameters
    ----------
    symbol
    backpointers

    Attributes
    ----------
    symbol
    backpointers
    """

    def __init__(self, symbol: str, *backpointers: CKYBackPointer):
        self._symbol = symbol
        self._backpointers = backpointers

    def to_tuple(self):
        return (self._symbol, self._backpointers)
        
    def __hash__(self) -> int:
        return hash(self.to_tuple())

    def __eq__(self, other: 'CKYChartEntry') -> bool:
        return self.to_tuple() == other.to_tuple()
    
    def __repr__(self) -> str:
        return self._symbol + ' -> ' + ' '.join(
            f"{bp[0]}({bp[1][0]}, {bp[1][1]})" 
            for bp in self.backpointers
        )

    def __str__(self) -> str:
        return self.__repr__()
    
    @property
    def symbol(self) -> str:
        return self._symbol

    @property
    def backpointers(self) -> tuple[CKYBackPointer, ...]:
        return self._backpointers

class CKYChart(Chart):
    """
    A chart for a CKY parser

    Jurafsky & Martin call this a table

    Parameters
    ----------
    input_size

    Attributes
    ----------
    parses
    """

    def __init__(self, input_size: int):
        self._input_size = input_size
        
        self._chart: list[list[set[CKYChartEntry]]] = [
            [set({})
             for _ in range(input_size+1)]
            for _ in range(input_size+1)
        ]
        
    def __getitem__(self, index: SpanIndices) -> set[CKYChartEntry]:
        i, j = index

        self._validate_index(i, j)
        
        return self._chart[i][j]

    def __setitem__(self, key: SpanIndices, item: set[CKYChartEntry]):
        i, j = key

        self._chart[i][j] = item
        
    def _validate_index(self, i, j):
        try:
            assert i < j
        except AssertionError:
            msg = "cannot index into the lower " +\
                  "triangle of the chart"
            raise ValueError(msg)

        try:
            self._chart[i]
        except IndexError:
            msg = "row index is too large"
            raise ValueError(msg)

        try:
            self._chart[i][j]
        except IndexError:
            msg = "column index is too large"
            raise ValueError(msg)

    @property
    def parses(self) -> set[Tree]:
        try:
            return self._parses
        except AttributeError:
            self._parses = self._construct_parses()
            return self._parses

    def _construct_parses(self, entry: Union['CKYChartEntry', None] = None) -> set[Tree]:
        """Construct the parses implied by the chart

        Parameters
        ----------
        entry
        """
        raise NotImplementedError
    
class ContextFreeGrammarParser(ABC):
    
    def __init__(self, grammar: ContextFreeGrammar):
        self._grammar = grammar

    def __call__(self, string, mode="recognize"):
        if mode == "recognize":
            return self._recognize(string)
        elif mode == "parse":
            return self._parse(string)            
        else:
            msg = 'mode must be "parse" or "recognize"'
            raise ValueError(msg)

class CKYParser(ContextFreeGrammarParser):
    """
    A CKY parser

    Parameters
    ----------
    grammar : ContextFreeGrammar
    """
    
    normal_form = NormalForm.CNF
    
    def _fill_chart(self, string: list[str]) -> CKYChart:
        raise NotImplementedError

    def _parse(self, string):
        chart = self._fill_chart(string)
        return chart.parses
        
    def _recognize(self, string):
        chart = self._fill_chart(string)
        
        return any([self._grammar.start_variable == entry.symbol
                    for entry in chart[0,len(string)]])
        

Task 2

Implement the CKYParser._fill_chart and CKYChart._construct_parses methods.

class CKYParser(CKYParser):
    
    def _fill_chart(self, string: list[str]) -> CKYChart:
        raise NotImplementedError 
    
class CKYChart(CKYChart):
    
    def _construct_parses(self, entry: CKYChartEntry | None = None) -> set[Tree]:
        raise NotImplementedError

Test your implementation by ensuring that the following call yields the correct number of Trees. (We are not using the grammar you extracted from the treebank because we want to a relatively minimal example to test against. The grammar you extracted will be relevant for Task 4 below.)

ContextFreeGrammar.parser_class = CKYParser

grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
                                       'with', 'a', 'fork', 'again', 'quickly'},
                             variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
                             rules={Rule('S', 'NP', 'VP'),
                                    Rule('NP', 'D', 'N'),
                                    Rule('NP', 'NP', 'PP'),
                                    Rule('VP', 'V', 'NP'),
                                    Rule('VP', 'VP', 'PP'),
                                    Rule('VP', 'Adv', 'VP'),
                                    Rule('VP', 'VP', 'Adv'),
                                    Rule('PP', 'P', 'NP'),
                                    Rule('D', 'the'),
                                    Rule('D', 'a'),
                                    Rule('N', 'greyhound'),
                                    Rule('N', 'salmon'),
                                    Rule('N', 'fork'),
                                    Rule('V', 'fork'),
                                    Rule('V', 'ate'),
                                    Rule('P', 'with'),
                                    Rule('Adv', 'again'),
                                    Rule('Adv', 'quickly')},
                             start_variable='S')

grammar(['the', 'greyhound', 'again', 'ate', 'the',
         'salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")

Dotted Rules

To implement Earley parsing, we need to augment our vanilla CFG rule from above to a dotted CFG rule, which tracks which constituents we may have seen up to a particular position in a sentence. This dotted rule needs to track not only the position of the dot in the rule (what kinds of consistuents might have been seen so far), but also which substring the dotted rule is associated with.

class DottedRule(Rule):
    
    def __init__(self, rule: Rule, span: SpanIndices, dot: int = 0):
        self._rule = rule
        self._left_side = rule.left_side
        self._right_side = rule.right_side
        
        self._span = span
        self._dot = dot
    
    def to_tuple(self) -> tuple[Rule, SpanIndices, int]:
        return self._rule, self._span, self._dot
    
    def __hash__(self) -> int:
        return hash(self.to_tuple())
    
    def __eq__(self, other) -> bool:
        return self.to_tuple() == other.to_tuple()
    
    def __repr__(self) -> str:
        return self._left_side + ' -> ' +\
               ' '.join(self._right_side[:self._dot]) +\
               ' . ' +\
               ' '.join(self._right_side[self._dot:]) +\
               ' [' + str(self._span[0]) + ', ' + str(self._span[1]) + ']'
    
    def complete(self, new_left_edge: int) -> tuple['DottedRule', int]:
        """Complete the next symbol in this rule
        
        Parameters
        ----------
        new_left_edge

        Returns
        -------
        new_dotted_rule
        completed_symbol
        old_left_edge
        """
        dot = self._dot + 1
        span = (self._span[0], new_left_edge)

        return DottedRule(self._rule, span, dot)

    @property
    def next_symbol(self) -> str:
        if self.is_complete:
            raise AttributeError('dotted rule is already complete')
        else:
            return self._right_side[self._dot]
        
    @property
    def dot(self) -> int:
        return self._dot
    
    @property
    def span(self) -> SpanIndices:
        return self._span
    
    @property
    def is_complete(self) -> bool:
        return self._dot == len(self._right_side)
    
    @property
    def left_side(self) -> str:
        return self._rule.left_side

To initialize a dotted CFG rule, we pass a vanilla CFG rule along with a tuple of indices representing the span that rule has recognized. If it has recognized nothing, the left index will be equal to the right index. We don’t need to pass the dot position because we assume that, on initialization, the dot is before the first right side symbol.

dotted_rule = DottedRule(Rule('S', 'NP', 'VP'), (0, 0))

dotted_rule

We can increment the dot by calling DottedRule.complete with the new right edge of the span.

dotted_rule.complete(2)

This procedure creates an entirely new object.

id(dotted_rule), id(dotted_rule.complete(2))

Calling DottedRule.complete twice will increment the dot twice.

dotted_rule.complete(2).complete(10)

Finally, dotted rules are hashable and behave how you would expect when hashed.

{dotted_rule, DottedRule(Rule('S', 'NP', 'VP'), (0, 0))}

Earley Parsing

We will implement Earley parsing and recognition using a similar system of classes to the ones we developed for CKY: EarleyCharts contain EarleyChartEntrys and are filled with those entries by a EarleyParser, which is a kind of ContextFreeGrammarParser.

EarleyBackPointer = tuple[str, int]

class EarleyChartEntry(ChartEntry):
    """A chart entry for a Earley chart

    Parameters
    ----------
    dotted_rule
    backpointers
    """

    def __init__(self, dotted_rule: DottedRule, *backpointers: EarleyBackPointer):
        self._dotted_rule = dotted_rule
        self._backpointers = backpointers

    def to_tuple(self) -> tuple[DottedRule, tuple[EarleyBackPointer, ...]]:
        return self._dotted_rule, self._backpointers
        
    def __hash__(self) -> int:
        return hash(self.to_tuple())

    def __eq__(self, other) -> bool:
        return self.to_tuple() == other.__key()
    
    def __repr__(self) -> str:
        return self._dotted_rule.__repr__()

    def __str__(self) -> str:
        return self.__repr__()

    @property
    def backpointers(self) -> tuple[EarleyBackPointer, ...]:
        return self._backpointers
    
    @property
    def dotted_rule(self):
        return self._dotted_rule
    
    @property
    def next_symbol(self) -> str:
        return self._dotted_rule.next_symbol
    
    @property
    def span(self) -> tuple[int]:
        return self._dotted_rule.span
    
    @property
    def is_complete(self):
        return self._dotted_rule.is_complete
    
    def complete(self, entry: 'EarleyChartEntry', new_left_edge: int) -> 'EarleyChartEntry':
        new_dotted_rule = self._dotted_rule.complete(new_left_edge)
        
        bp = (self._dotted_rule.next_symbol, self._dotted_rule.span[1])
        backpointers = self._backpointers + (bp,)
        
        return EarleyChartEntry(new_dotted_rule, backpointers)
    
    def is_completion_of(self, other: 'EarleyChartEntry') -> bool:
        return self._dotted_rule.left_side == other.dotted_rule.next_symbol

class EarleyChart(Chart):
    """A chart for an Earley parser

    Parameters
    ----------
    input_size
    """
    def __init__(self, input_size: int, start_variable: str = 'S'):
        self._start_variable = start_variable
        
        self._chart: list[set[EarleyChartEntry]] = [
            set() for _ in range(input_size+1)
        ]
        
    def __getitem__(self, index) -> set[EarleyChartEntry]:
        return self._chart[index]

    def __setitem__(self, key, item) -> None:
        self._chart[key] = item

    @property
    def parses(self) -> set[Tree]:
        try:
            return self._parses
        except AttributeError:
            self._parses = set()
            
            for entry in self._chart[-1]:
                is_start = entry.dotted_rule.left_side == self._start_variable
                covers_string = entry.dotted_rule.span == (0, self.input_size)
                
                if is_start and covers_string:
                    self._parses.add(self._construct_parses(entry))
            
            return self._parses

    def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
        """Construct the parses implied by the chart

        Parameters
        ----------
        entry
        """
        raise NotImplementedError     
    
    @property
    def input_size(self) -> int:
        return len(self._chart) - 1
    
class EarleyParser(ContextFreeGrammarParser):
    """
    An Earley parser

    Parameters
    ----------
    grammar : ContextFreeGrammar
    """
    normal_form = None
    
    def _fill_chart(self, string: list[str]) -> EarleyChart:
        """
        a chart for the string based on a CFG

        Parameters
        ----------
        string
        """
        raise NotImplementedError
                    
    def _predict(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
        for rule in self._grammar.rules(entry.next_symbol):
            span = (position, position)
            dotted_rule = DottedRule(rule, span)
            entry = EarleyChartEntry(dotted_rule)

            chart[position].add(entry)
     
    def _scan(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
        word = self._string[position]
        pos_for_word = self._grammar.parts_of_speech(word)
        
        if entry.next_symbol in pos_for_word:
            rule = Rule(entry.next_symbol, word)
            span = (position, position+1)
            dotted_rule = DottedRule(rule, span, dot=1)
            
            unary_entry = EarleyChartEntry(dotted_rule)
            
            chart[position+1].add(unary_entry)
        
    def _complete(self, chart: EarleyChart, entry: EarleyChartEntry, position: int):
        start, end = entry.span
        
        for prev_entry in chart[start]:
            if not prev_entry.is_complete and entry.is_completion_of(prev_entry):
                completed_entry = prev_entry.complete(entry, end)
                
                chart[position].add(completed_entry)
        
    def _parse(self, string: str | list[str]):
        chart = self._fill_chart(string)
        return chart.parses

    def _recognize(self, string: str | list[str]):
        chart = self._fill_chart(string)
        
        for entry in chart[-1]:
            is_start = entry.dotted_rule.left_side == self._grammar.start_variable
            covers_string = entry.dotted_rule.span == (0, chart.input_size)
            
            if is_start and covers_string:
                return True
            
        else:
            return False
        
ContextFreeGrammar.parser_class = EarleyParser

Task 3

Implement the EarleyParser._fill_chart and EarleyChart._construct_parses method. Note that EarleyChart._construct_parses works slightly differently than CKYChart._construct_parses in only producing a single Tree. The parses attributes collects all Trees implied by a chart into a set.

class EarleyParser(EarleyParser):
    
    def _fill_chart(self, string: list[str]) -> EarleyChart:
        """
        a chart for the string based on a CFG

        Parameters
        ----------
        string
        """
        raise NotImplementedError
    
class EarleyChart(EarleyChart):
    
    def _construct_parses(self, entry: 'EarleyChartEntry') -> Tree:
        """Construct the parses implied by the chart

        Parameters
        ----------
        entry
        """
        raise NotImplementedError

Test your implementation using the same grammar and sentence we used to test the CKY implementation.

ContextFreeGrammar.parser_class = EarleyParser

grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
                                       'with', 'a', 'fork', 'again', 'quickly'},
                             variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
                             rules={Rule('S', 'NP', 'VP'),
                                    Rule('NP', 'D', 'N'),
                                    Rule('NP', 'NP', 'PP'),
                                    Rule('VP', 'V', 'NP'),
                                    Rule('VP', 'VP', 'PP'),
                                    Rule('VP', 'Adv', 'VP'),
                                    Rule('VP', 'VP', 'Adv'),
                                    Rule('PP', 'P', 'NP'),
                                    Rule('D', 'the'),
                                    Rule('D', 'a'),
                                    Rule('N', 'greyhound'),
                                    Rule('N', 'salmon'),
                                    Rule('N', 'fork'),
                                    Rule('V', 'fork'),
                                    Rule('V', 'ate'),
                                    Rule('P', 'with'),
                                    Rule('Adv', 'again'),
                                    Rule('Adv', 'quickly')},
                             start_variable='S')

grammar(['the', 'greyhound', 'again', 'ate', 'the',
         'salmon', 'with', 'a', 'fork', 'quickly'], mode="parse")

Task 4

Implement an instance method EarleyParser._predict_next_word that is used when EarleyParser is called in "predict" mode. This method should take in a tokenized string—the prefix —and output a dictionary mapping from parts-of-speech that could come after the prefix to words that could come after the prefix.

from typing import Dict
from collections import defaultdict

class EarleyParser(EarleyParser):
    
    def __call__(self, string, mode="recognize"):
        if mode == "recognize":
            return self._recognize(string)
        elif mode == "parse":
            return self._parse(string)  
        elif mode == "predict":
            return self._predict_next_word(string)  
        else:
            msg = 'mode must be "parse", "recognize", or "predict"'
            raise ValueError(msg)
    
    def _predict_next_word(self, prefix: list[str]) -> Dict[str, set[str]]:
        raise NotImplementedError

ContextFreeGrammar.parser_class = EarleyParser

Test your prediction method against the grammar we used in class.

grammar = ContextFreeGrammar(alphabet={'the', 'greyhound', 'ate', 'the', 'salmon',
                                       'with', 'a', 'fork', 'again', 'too'},
                             variables={'S', 'NP', 'VP', 'PP', 'D', 'N', 'V', 'P', 'Adv'},
                             rules={Rule('S', 'NP', 'VP'),
                                    Rule('NP', 'D', 'N'),
                                    Rule('NP', 'NP', 'PP'),
                                    Rule('VP', 'V', 'NP'),
                                    Rule('VP', 'VP', 'PP'),
                                    Rule('VP', 'Adv', 'VP'),
                                    Rule('VP', 'VP', 'Adv'),
                                    Rule('PP', 'P', 'NP'),
                                    Rule('D', 'the'),
                                    Rule('D', 'a'),
                                    Rule('N', 'greyhound'),
                                    Rule('N', 'salmon'),
                                    Rule('N', 'fork'),                                        
                                    Rule('V', 'ate'),
                                    Rule('VP', 'ate'),
                                    Rule('P', 'with'),
                                    Rule('Adv', 'again'),
                                    Rule('Adv', 'too')},
                             start_variable='S')

For instance, with the prefix the greyhound, you should get the following dictionary:

{'Adv': {'again', 'too'},
 'VP': {'ate'},
 'P': {'with'},
 'V': {'ate'}}
# write tests here

Task 5

Randomly sample sentences from EWT and compute the predicted words given the first one, two, and three words in that sentence. For each such set of predictions for each sentence count the total number of parts-of-speech and the total number of words that could follow that prefix.

grammar = ContextFreeGrammar.from_treebank(EnglishWebTreebank())

# sample and predict here

You’ll see a general pattern in the numbers, but if you dig down into exactly what the next word predictions are, they will look at. What gives rise to this oddness?