---
title: Building a corpus reader
jupyter: python3
execute:
  eval: false
---
```{python}
#| code-fold: true
#| code-summary: Definition of `Tree` up to this point
from rdflib import Graph, URIRef
class Tree:
    """A tree whose nodes carry string data.

    Parameters
    ----------
    data : str
        The data contained in this tree.
    children : list[Tree], optional
        The subtrees of this tree. Defaults to no children.

    Raises
    ------
    TypeError
        If any element of ``children`` is not a ``Tree``.
    """

    def __init__(self, data: str, children: 'list[Tree] | None' = None):
        self._data = data
        # Build a fresh list per instance: a mutable default argument would
        # be one list object shared by every tree built without children.
        self._children = [] if children is None else list(children)

        self._validate()

    def _validate(self) -> None:
        """Check that every child is a ``Tree``.

        Raises
        ------
        TypeError
            If any child is not a ``Tree`` instance.
        """
        # Explicit check rather than ``assert``, which is skipped when
        # Python runs with optimizations enabled.
        if not all(isinstance(c, Tree) for c in self._children):
            msg = 'all children must be trees'
            raise TypeError(msg)

    @property
    def data(self) -> str:
        """The data at this node."""
        return self._data

    @property
    def children(self) -> list['Tree']:
        """The subtrees of this node."""
        return self._children

    def __str__(self):
        # A leaf renders as its own data; an internal node renders as the
        # space-joined rendering of its children (its own data is omitted).
        if self._children:
            return ' '.join(str(c) for c in self._children)
        return str(self._data)

    def __repr__(self):
        return self.to_string(0)

    def to_string(self, depth: int) -> str:
        """Render the tree as an indented string.

        Parameters
        ----------
        depth : int
            The current depth for indentation.

        Returns
        -------
        str
            An indented text representation of the tree, one node per line.
        """
        # The root (depth 0) gets no prefix; deeper nodes get ``depth - 1``
        # spaces followed by '--'.
        prefix = (depth - 1) * ' ' + ('--' if depth > 0 else '')
        parts = [prefix + self._data + '\n']
        parts.extend(c.to_string(depth + 1) for c in self._children)

        return ''.join(parts)

    def __contains__(self, data: str) -> bool:
        """Return whether any node in this tree carries ``data``."""
        # pre-order depth-first search
        return self._data == data or any(data in c for c in self._children)

    def __getitem__(self, idx: 'int | tuple[int, ...]') -> 'Tree':
        """Return the subtree at an index path.

        Parameters
        ----------
        idx : int or tuple[int, ...]
            A child position, or a path of child positions leading down
            from this node. The empty tuple selects this node itself.

        Returns
        -------
        Tree
            The subtree found at the end of the path.

        Raises
        ------
        IndexError
            If any index is not a non-negative int, or if a position on
            the path does not exist.
        """
        idx = (idx,) if isinstance(idx, int) else idx

        # Explicit check rather than ``assert`` so the validation survives
        # running with optimizations enabled.
        if not all(isinstance(i, int) and i >= 0 for i in idx):
            errmsg = ('index must be a non-negative int or '
                      'tuple of non-negative ints')
            raise IndexError(errmsg)

        if not idx:
            return self
        elif len(idx) == 1:
            return self._children[idx[0]]
        else:
            return self._children[idx[0]][idx[1:]]

    # Cache from node data to its URI. It lives on the class, so every tree
    # maps the same label to the same URIRef.
    RDF_TYPES = {}

    # Predicates used for the triples emitted by ``to_rdf``.
    RDF_EDGES = {'is': URIRef('is-a'),
                 'parent': URIRef('is-the-parent-of'),
                 'child': URIRef('is-a-child-of'),
                 'sister': URIRef('is-a-sister-of')}

    def to_rdf(self, graph=None, nodes=None, idx=tuple()) -> Graph:
        """Convert the tree to an RDF graph for SPARQL querying.

        Each node is named by its index path joined with underscores (the
        root's name is the empty string).

        Parameters
        ----------
        graph : Graph, optional
            An existing graph to add triples to. A new one is created when
            omitted.
        nodes : dict, optional
            A mapping from index tuples to URI nodes; it is filled in as
            the tree is traversed. A new one is created when omitted.
        idx : tuple, optional
            The index of this node in the parent tree.

        Returns
        -------
        Graph
            The RDF graph representing the tree.
        """
        graph = Graph() if graph is None else graph
        # A fresh mapping per top-level call: a mutable default would be
        # shared by every tree ever converted.
        nodes = {} if nodes is None else nodes

        idxstr = '_'.join(str(i) for i in idx)
        nodes[idx] = URIRef(idxstr)

        if self._data not in Tree.RDF_TYPES:
            Tree.RDF_TYPES[self._data] = URIRef(self._data)

        typetriple = (nodes[idx],
                      Tree.RDF_EDGES['is'],
                      Tree.RDF_TYPES[self._data])
        graph.add(typetriple)

        child_indices = [idx + (i,) for i in range(len(self._children))]

        for child, childidx in zip(self._children, child_indices):
            # Recursing first registers nodes[childidx] for the edges below.
            child.to_rdf(graph, nodes, childidx)

            partriple = (nodes[idx],
                         Tree.RDF_EDGES['parent'],
                         nodes[childidx])
            chitriple = (nodes[childidx],
                         Tree.RDF_EDGES['child'],
                         nodes[idx])

            graph.add(partriple)
            graph.add(chitriple)

        # NOTE(review): every ordered pair of children is linked, including
        # a child with itself, so each child is recorded as its own sister.
        # Kept as-is; confirm whether queries rely on this.
        for child1idx in child_indices:
            for child2idx in child_indices:
                sistriple = (nodes[child1idx],
                             Tree.RDF_EDGES['sister'],
                             nodes[child2idx])
                graph.add(sistriple)

        self._rdf_nodes = nodes

        return graph

    @property
    def rdf(self) -> Graph:
        """The lazily-constructed RDF graph for this tree."""
        if not hasattr(self, "_rdf"):
            self._rdf = self.to_rdf()

        return self._rdf

    def find(self, query: str) -> list[tuple[int, ...]]:
        """Find subtrees matching a SPARQL query.

        Parameters
        ----------
        query : str
            A SPARQL SELECT query. The first selected variable of each
            result is read as a node name.

        Returns
        -------
        list[tuple[int, ...]]
            Index paths to matching nodes; the root is the empty tuple.
        """
        paths = []
        for res in self.rdf.query(query):
            name = str(res[0])
            # The root is named by the empty string, which must map to the
            # empty path rather than be passed to int().
            path = tuple(int(i) for i in name.split('_')) if name else ()
            paths.append(path)

        return paths
```
Now that we can search over individual trees, let's see how to automatically load all trees from a corpus. We'll use the constituency-parsed [English Web TreeBank](https://catalog.ldc.upenn.edu/LDC2012T13) for this purpose. This corpus is separated into different genres, sources, and documents, with each `.tree` file containing possibly multiple parse trees (one per line).
```{python}
#| colab: {base_uri: 'https://localhost:8080/'}
#| executionInfo: {elapsed: 511, status: ok, timestamp: 1680619992100, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
#| outputId: 75090126-df60-4208-a45b-e5b16cf207bd
# Shell out to tar: extract a single .tree file from the corpus archive and pipe it to `cat` so its contents are shown.
!tar -xzf LDC2012T13.tgz --to-command=cat 'eng_web_tbk/data/newsgroup/penntree/groups.google.com_8TRACKGROUPFORCOOLPEOPLE_3b43577fb9121c9f_ENG_20050320_090500.xml.tree'
```
We will talk about how to actually parse these sorts of strings against a grammar later in the class, but for current purposes, we'll use [`pyparsing`](https://github.com/pyparsing/pyparsing) to define a grammar and parse these strings to a list of lists.
```{python}
#| executionInfo: {elapsed: 15, status: ok, timestamp: 1680619999957, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
import pyparsing
# Parentheses delimit constituents but carry no content of their own, so
# they are matched and then dropped from the parse results.
LPAR = pyparsing.Literal('(').suppress()
RPAR = pyparsing.Literal(')').suppress()

# A label or a word: any run of characters that are neither parentheses
# nor whitespace.
data = pyparsing.Regex(r'[^\(\)\s]+')

# Declared ahead of its definition so the grammar can refer to itself.
exp = pyparsing.Forward()

# A bracketed constituent: a label followed by zero or more expressions,
# grouped so that each constituent becomes its own nested list.
expList = pyparsing.Group(LPAR + data + pyparsing.ZeroOrMore(exp) + RPAR)

# An expression is either a bare token or a bracketed constituent.
exp <<= data | expList

# Attach the grammar to the class so its constructors can reach it.
Tree.PARSER = exp
```
```{python}
#| colab: {base_uri: 'https://localhost:8080/'}
#| executionInfo: {elapsed: 1126, status: ok, timestamp: 1680620001069, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
#| outputId: d8f65200-e826-4fb8-81ab-0f1af7792f81
import tarfile
from pprint import pprint
# Path, inside the archive, of the document whose first line we inspect.
fname = "eng_web_tbk/data/newsgroup/penntree/groups.google.com_8TRACKGROUPFORCOOLPEOPLE_3b43577fb9121c9f_ENG_20050320_090500.xml.tree"

# Open the archive and the member file together; only the first line of
# the member is read.
with tarfile.open("LDC2012T13.tgz") as corpus, \
        corpus.extractfile(fname) as treefile:
    treestr = treefile.readline().decode()

# Drop the first two and last two characters of the line before parsing.
# presumably these are the outer "( " wrapper and the closing ")" plus
# newline — verify against the corpus format.
treestr = treestr[2:-2]

# parseString returns a ParseResults; its first element is the tree.
treelist = exp.parseString(treestr)[0]

treelist
```
First, we'll define a method for building a `Tree` from this `ParseResults` object, which can be viewed as a list of list of lists...
```{python}
#| executionInfo: {elapsed: 3, status: ok, timestamp: 1680620001405, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
class Tree(Tree):
    """Extend the existing ``Tree`` with constructors from parsed input."""

    @classmethod
    def from_string(cls, treestr: str) -> 'Tree':
        """Parse a bracketed tree string into a Tree.

        Parameters
        ----------
        treestr : str
            A parenthesized tree string, optionally enclosed in an extra
            unlabeled pair of parentheses (as in ``( (S ...) )``) and
            optionally surrounded by whitespace.

        Returns
        -------
        Tree
            The parsed tree.
        """
        text = treestr.strip()

        # Remove the unlabeled outer wrapper only when it is really there:
        # i.e. what lies inside the outermost parentheses itself starts
        # with a parenthesis. Slicing off a fixed number of characters
        # would silently mangle an unwrapped string.
        inner = text[1:-1].strip()
        if text.startswith('(') and text.endswith(')') \
                and inner.startswith('('):
            text = inner

        treelist = cls.PARSER.parseString(text)[0]

        return cls.from_list(treelist)

    @classmethod
    def from_list(cls, treelist) -> 'Tree':
        """Build a Tree from a nested list structure.

        Parameters
        ----------
        treelist : list or str
            A nested list (from pyparsing) whose first element is the node
            label and whose remaining elements are its children, or a
            terminal string.

        Returns
        -------
        Tree
            The constructed tree.
        """
        # A bare string is a terminal: use the whole string as the data.
        # (Indexing it would keep only its first character.)
        if isinstance(treelist, str):
            return cls(treelist)

        # Every element after the label is a child, whether it is a
        # terminal string or a nested constituent; a constituent with no
        # further elements becomes a childless node.
        return cls(treelist[0], [cls.from_list(l) for l in treelist[1:]])
```
We can now build a lightweight container for our trees.
```{python}
#| colab: {base_uri: 'https://localhost:8080/'}
#| executionInfo: {elapsed: 595, status: ok, timestamp: 1680620001997, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
#| outputId: 9dac79d6-f474-488a-ec60-e1750ffacd6d
import tarfile
from collections import defaultdict
class EnglishWebTreebank:
    """Lazy reader for the English Web Treebank.

    The archive is not opened until iteration starts, and every call to
    ``items`` reads the archive afresh, so the reader can be iterated more
    than once.

    Parameters
    ----------
    root : str
        Path to the LDC tgz archive.
    first_only : bool, optional
        If true (the default), read only the first tree of each ``.tree``
        file. If false, read one tree from every non-blank line.
    """

    def __init__(self, root='LDC2012T13.tgz', *, first_only=True):
        self._root = root
        self._first_only = first_only

    def _read(self):
        """Yield ``(filename, Tree)`` pairs by scanning the archive."""
        with tarfile.open(self._root) as corpus:
            for fname in corpus.getnames():
                if '.xml.tree' not in fname:
                    continue

                treefile = corpus.extractfile(fname)

                # extractfile returns None for members that are not
                # regular files or links; there is nothing to read then.
                if treefile is None:
                    continue

                with treefile:
                    for line in treefile:
                        treestr = line.decode()

                        if not treestr.strip():
                            continue

                        yield fname, Tree.from_string(treestr)

                        if self._first_only:
                            break

    def items(self):
        """Yield filename-tree pairs from the treebank."""
        # A new generator per call: one stored at construction time would
        # be exhausted after a single pass.
        yield from self._read()
# Build a reader over the default archive path and display the first
# (filename, tree) pair it yields.
ewt = EnglishWebTreebank()
next(ewt.items())
```
Now, we can run arbitrary queries across trees.
```{python}
#| executionInfo: {elapsed: 83578, status: ok, timestamp: 1680622491695, user: {displayName: Aaron Steven White, userId: 06256629009318567325}, user_tz: 240}
# The queries do not change between trees, so they are defined once here
# instead of being rebuilt on every iteration.

# NP-SBJ nodes that have a PRP child.
SUBJ_PRP_QUERY = '''SELECT ?node
WHERE { ?node <is-a> <NP-SBJ>;
<is-the-parent-of> ?child.
?child <is-a> <PRP>.
}'''

# All NP-SBJ nodes.
SUBJ_QUERY = '''SELECT ?node
WHERE { ?node <is-a> <NP-SBJ>. }'''

# NP nodes whose parent is a VP or a PP and that have a PRP child.
OBJ_PRP_QUERY = '''SELECT ?node
WHERE { ?parent <is-the-parent-of> ?node.
{ ?parent <is-a> <VP> } UNION { ?parent <is-a> <PP> }
?node <is-the-parent-of> ?child;
<is-a> <NP>.
?child <is-a> <PRP>.
}'''

# All NP nodes whose parent is a VP or a PP.
OBJ_QUERY = '''SELECT ?node
WHERE { ?parent <is-the-parent-of> ?node.
{ ?parent <is-a> <VP> } UNION { ?parent <is-a> <PP> }
?node <is-a> <NP>.
}'''

ewt = EnglishWebTreebank()

# Running totals of matches across the trees the reader yields.
n_subj = 0
n_subj_prp = 0
n_obj_prp = 0
n_obj = 0

for _, tree in ewt.items():
    idx_subj_prp = tree.find(SUBJ_PRP_QUERY)
    idx_subj = tree.find(SUBJ_QUERY)
    idx_obj_prp = tree.find(OBJ_PRP_QUERY)
    idx_obj = tree.find(OBJ_QUERY)

    # Accumulate the per-tree match counts; without this the counters
    # would stay at zero and the loop would have no effect.
    n_subj += len(idx_subj)
    n_subj_prp += len(idx_subj_prp)
    n_obj_prp += len(idx_obj_prp)
    n_obj += len(idx_obj)
```