Definition of Tree up to this point
from typing import TypeVar

DataType = TypeVar("DataType")

class Tree:
    """A tree
    
    Parameters
    ----------
    data
        The data contained in this tree
    children
        The subtrees of this tree
    """
    def __init__(self, data: DataType, children: list['Tree']=[]):
        self._data = data
        self._children = children
        
        self._validate()
        
    def _validate(self) -> None:
        try:
            assert all(isinstance(c, Tree)
                       for c in self._children)
        except AssertionError:
            msg = 'all children must be trees'
            raise TypeError(msg)
        
    @property
    def data(self) -> DataType:
        return self._data 
    
    @property
    def children(self) -> list['Tree']:
        return self._children

    def __str__(self):
        if self._children:
            return ' '.join(c.__str__() for c in self._children)
        else:
            return str(self._data)
        
    def __repr__(self):
        return self.to_string(0)
     
    def to_string(self, depth: int) -> str:
        s = (depth - 1) * '  ' +\
            int(depth > 0) * '--' +\
            self._data + '\n'
        s += ''.join(c.to_string(depth+1)
                     for c in self._children)
        
        return s

    def __contains__(self, data: DataType) -> bool:
        # pre-order depth-first search
        if self._data == data:
            return True
        else:
            for child in self._children:
                if data in child:
                    return True
                
            return False

Let’s return to the motivating example from the last section: suppose I’m interested in finding all sentences with a definite determiner in a subject.

We know how to compute containment. The next question is: how do we find particular subtrees? The basic idea is going to be that, when searching, we want to return a subtree instead of a boolean. But it’s also going to be important to know where that subtree is with respect to other subtrees–e.g. to figure out whether it is a subject. This is going to require a way of indexing trees.

Indexation by search traversal

One way to index trees is analogous to lists: an int. But what does that int represent? The idea is that it will represent when a particular search algorithm visits a node. One way to do this is to flatten or linearize the tree according to the order in which, say, pre-order depth-first search visits subtrees, then to index into the flattened version of the tree.

class Tree(Tree):
            
    def __getitem__(self, idx):
        return self.flattened[idx]
    
    def __len__(self):
        return len(self.flattened)

    @property
    def flattened(self):
        try:
            return self._flattened
        except AttributeError:
            # pre-order depth-first search
            self._flattened = [self] +\
                              [elem 
                               for c in self._children
                               for elem in c.flattened]
            return self._flattened
tree = Tree('S', 
             [Tree('NP', 
                   [Tree('D', 
                         [Tree('a')]),
                    Tree('N', 
                         [Tree('greyhound')])]),
             Tree('VP', 
                   [Tree('V', 
                         [Tree('loves')]),
                    Tree('NP',
                         [Tree('D',
                               [Tree('a')]),
                          Tree('N',
                               [Tree('greyhound')])])])])
tree[0]
S
--NP
  --D
    --a
  --N
    --greyhound
--VP
  --V
    --loves
  --NP
    --D
      --a
    --N
      --greyhound
tree[1]
NP
--D
  --a
--N
  --greyhound
tree[2]
D
--a
tree[4]
N
--greyhound
for i in range(len(tree1)):
    print(i, tree[i].data)
0 S
1 NP
2 D
3 a
4 N
5 greyhound
6 VP
7 V
8 loves
9 NP
10 D
11 a
12 N
13 greyhound

Indexation by path to root

One issue with this indexation scheme is that it makes it a bit hard to represent relations like parenthood or sisterhood in a tree. One way to deal with this issue is to instead index using tuples representing the index path to the root.

class Tree(Tree):
        
    def __getitem__(self, idx: tuple[int]) -> 'Tree':
        idx = (idx,) if isinstance(idx, int) else idx
        
        try:
            assert all(isinstance(i, int) for i in idx)
            assert all(i >= 0 for i in idx)
        except AssertionError:
            errmsg = 'index must be a positive int or tuple of positive ints'
            raise IndexError(errmsg)
        
        if not idx:
            return self
        elif len(idx) == 1:
            return self._children[idx[0]]
        else:
            return self._children[idx[0]][idx[1:]]
tree = Tree('S', 
             [Tree('NP', 
                   [Tree('D', 
                         [Tree('a')]),
                    Tree('N', 
                         [Tree('greyhound')])]),
             Tree('VP', 
                   [Tree('V', 
                         [Tree('loves')]),
                    Tree('NP',
                         [Tree('D',
                               [Tree('a')]),
                          Tree('N',
                               [Tree('greyhound')])])])])
tree1[tuple()]
S
--NP
  --D
    --a
  --N
    --greyhound
--VP
  --V
    --loves
  --NP
    --D
      --a
    --N
      --greyhound
tree1[0]
NP
--D
  --a
--N
  --greyhound
tree1[0,0]
D
--a
tree1[0,1]
N
--greyhound
tree1[0,1,0]
greyhound
tree1[1]
VP
--V
  --loves
--NP
  --D
    --a
  --N
    --greyhound
tree1[1,1]
NP
--D
  --a
--N
  --greyhound
tree1[1,1,0]
D
--a
tree1[1,1,0,0]
a