import pyparsing
from typing import TypeVar, Union
from rdflib import Graph, URIRef
= TypeVar("DataType")
DataType = list[str | tuple[Union[str, 'TreeList']]]
TreeList
class TreeOld:
"""A tree
Parameters
----------
data
children
"""
= {}
RDF_TYPES = {'is': URIRef('is-a'),
RDF_EDGES 'parent': URIRef('is-the-parent-of'),
'child': URIRef('is-a-child-of'),
'sister': URIRef('is-a-sister-of')}
= pyparsing.Suppress('(')
LPAR = pyparsing.Suppress(')')
RPAR = pyparsing.Regex(r'[^\(\)\s]+')
DATA
= pyparsing.Forward()
PARSER = pyparsing.ZeroOrMore(PARSER)
SUBTREE = pyparsing.Group(LPAR + DATA + SUBTREE + RPAR)
PARSERLIST <<= DATA | PARSERLIST
PARSER
def __init__(self, data: DataType, children: list['TreeOld'] = []):
self._data = data
self._children = children
self._validate()
def __str__(self):
if self._children:
return ' '.join(c.__str__() for c in self._children)
else:
return str(self._data)
def __repr__(self):
return self.to_string()
def to_string(self, depth: int = 0) -> str:
= (depth - 1) * ' ' +\
s int(depth > 0) * '--' +\
self._data + '\n'
+= ''.join(c.to_string(depth+1)
s for c in self._children)
return s
def __contains__(self, data: DataType) -> bool:
# pre-order depth-first search
if self._data == data:
return True
else:
for child in self._children:
if data in child:
return True
return False
def __getitem__(self, idx: tuple[int]) -> DataType:
if isinstance(idx, int):
return self._children[idx]
elif len(idx) == 1:
return self._children[idx[0]]
elif idx:
return self._children[idx[0]].__getitem__(idx[1:])
else:
return self
@property
def data(self) -> DataType:
return self._data
@property
def children(self) -> list['TreeOld']:
return self._children
def _validate(self):
try:
assert all(isinstance(c, Tree)
for c in self._children)
except AssertionError:
= 'all children must be trees'
msg raise TypeError(msg)
def index(self, data: DataType, index_path: tuple[int] = tuple()):
= [index_path] if self._data==data else []
indices = [] if index_path == -1 else index_path
root_path
+= [j
indices for i, c in enumerate(self._children)
for j in c.index(data, root_path+(i,))]
return indices
def to_rdf(
self,
| None=None,
graph: Graph dict[int, URIRef] = {},
nodes: tuple[int] = tuple()
idx: -> Graph:
) = Graph() if graph is None else graph
graph
= '_'.join(str(i) for i in idx)
idxstr = URIRef(idxstr)
nodes[idx]
if self._data not in self.RDF_TYPES:
self.RDF_TYPES[self._data] = URIRef(self._data)
= (nodes[idx],
typetriple self.RDF_EDGES['is'],
self.RDF_TYPES[self.data])
graph.add(typetriple)
for i, child in enumerate(self._children):
= idx+(i,)
childidx
child.to_rdf(graph, nodes, childidx)
= (nodes[idx],
partriple self.RDF_EDGES['parent'],
nodes[childidx])= (nodes[childidx],
chitriple self.RDF_EDGES['child'],
nodes[idx])
graph.add(partriple)
graph.add(chitriple)
for i, child1 in enumerate(self._children):
for j, child2 in enumerate(self._children):
= idx+(i,)
child1idx = idx+(j,)
child2idx = (nodes[child1idx],
sistriple 'sister'],
Tree.RDF_EDGES[
nodes[child2idx])
graph.add(sistriple)
self._rdf_nodes = nodes
return graph
@property
def rdf(self) -> Graph:
return self.to_rdf()
def find(self, query):
return [tuple([int(i)
for i in str(res[0]).split('_')])
for res in self.rdf.query(query)]
@classmethod
def from_string(cls, treestr: str) -> 'TreeOld':
= cls.PARSER.parseString(treestr[2:-2])[0]
treelist
return cls.from_list(treelist)
@classmethod
def from_list(cls, treelist: TreeList):
if isinstance(treelist, str):
return cls(treelist[0])
elif isinstance(treelist[1], str):
return cls(treelist[0], [cls(treelist[1])])
else:
return cls(treelist[0], [cls.from_list(l) for l in treelist[1:]])
Assignments 5 and 6
Like Assignments 1/2 and 3/4, Assignments 5 and 6 are bundled together. You only need to do Task 1 for Assignment 5 and Tasks 2 and 3 for Assignment 6.
These assignments focus on implementing fuzzy tree search using edit distance and regular expressions. In these supplementary notes, various tree search algorithms are developedthat look for an exact match between a query and the data contained in particular trees. I’ve copied in the relevant class below as TreeOld
.
In fuzzy search, we allow this exact matching restriction to be loosened by instead allowing that matches be (i) within some fixed edit distance; and/or (ii) closest (in terms of edit distance) to the query among all pieces of data. I’ve copied simplified version of the edit distance class that we developed in class below as EditDistance
.
import numpy as np
class EditDistance:
'''Distance between strings
Parameters
----------
insertion_cost
deletion_cost
substitution_cost
'''
def __init__(self, insertion_cost: float = 1.,
float = 1.,
deletion_cost: float | None = None):
substitution_cost: self._insertion_cost = insertion_cost
self._deletion_cost = deletion_cost
if substitution_cost is None:
self._substitution_cost = insertion_cost + deletion_cost
else:
self._substitution_cost = substitution_cost
def __call__(self, source: str | list[str], target: str | list[str]) -> float:
'''The edit distance between the source and target
The use of lists enables digraphs to be identified
Parameters
----------
source
target
'''
# coerce to list if not a list
if isinstance(source, str):
= list(source)
source
if isinstance(target, str):
= list(target)
target
= len(source), len(target)
n, m = ['#']+source, ['#']+target
source, target
= np.zeros([n+1, m+1], dtype=float)
distance
for i in range(1,n+1):
0] = distance[i-1,0]+self._deletion_cost
distance[i,
for j in range(1,m+1):
0,j] = distance[0,j-1]+self._insertion_cost
distance[
for i in range(1,n+1):
for j in range(1,m+1):
if source[i] == target[j]:
= 0.
substitution_cost else:
= self._substitution_cost
substitution_cost
= np.array([distance[i-1,j]+self._deletion_cost,
costs -1,j-1]+substitution_cost,
distance[i-1]+self._insertion_cost])
distance[i,j
= costs.min()
distance[i,j]
return distance[n,m]
Task 1
Lines: 14
Define an instance method fuzzy_find
. This method should take a piece of query data
and optionally a distance
and return all of the nodes that have data that is within edit distance distance
from data
; and/or if , closest
is True
, it should return all of the nodes closest to data
among all nodes in the tree.
For instance:
fuzzy_find('review', distance=3., closest=False)
will return a tuple of every piece of data in the tree within edit distance 3 from review (e.g. view, reviewer, reviews, etc.), its distance to review, and its index; if there is nothing within that edit distance, an empty list will be returnedfuzzy_find('review', distance=3., closest=True)
will return a tuple of the closest pieces of data in the tree that are also within edit distance 3 from review (e.g. view, reviewer, reviews, etc.), its distance to review, and its index; if there is nothing within that edit distance, an empty list will be returnedfuzzy_find('review', distance=np.inf, closest=True)
will return a tuple of the closest pieces of data in the tree to review (e.g. view, reviewer, reviews, etc.), regardless of edit distance, its distance to review, and its index; this will always return somethingfuzzy_find('review', distance=np.inf, closest=False)
will return a tuple of every piece of data in the tree to review (e.g. view, reviewer, reviews, etc.), regardless of edit distance, its distance to review, and its index; this will always return a list with as many elements as there are nodes in the tree
This method should also support only searching the terminal nodes (leaves) of the tree with the flag terminals_only
.
Hint: you should look back at the methods we defined for searching and indexing the tree above. Specifically, to understand why you might want something like index_path
defaulting to the empty tuple, look at the index
method of TreeOld
.
= tuple[tuple, str | list[str], float]
FuzzyFindResult
class Tree(TreeOld):
= EditDistance(1., 1.)
DIST
def fuzzy_find(self, data: Union[str, list[str]],
bool = True,
closest: float = np.inf,
distance: bool = True,
case_fold: bool = True,
terminals_only: tuple = tuple()) -> list[FuzzyFindResult]:
index_path:
'''Find the (closest) strings within a certain distance
Defaults to computing the closest strings among the terminals and ignoring case
The format of the things returned is [((0,1,0), "view", 2.0), ...]. Note that
edit distance can be computed on either a `str` or `List[str]`; that's why
the middle element of each tuple might be either.
Parameters
----------
data
the data to match against
closest
whether to return only the closest strings or all strings within distance
distance
the distance within which a string must be
case_fold
whether to lower-case the data
terminals_only
whether to only search the terminals
index_path
'''
raise NotImplementedError
Write tests that use the following tree as input data.
= '( (SBARQ (WHNP-1 (WP What)) (SQ (NP-SBJ-1 (-NONE- *T*)) (VP (VBZ is) (NP-PRD (NP (DT the) (JJS best) (NN place)) (SBAR (WHADVP-2 (-NONE- *0*)) (S (NP-SBJ (-NONE- *PRO*)) (VP (TO to) (VP (VB get) (NP (NP (NNS discounts)) (PP (IN for) (NP (NML (NNP San) (NNP Francisco)) (NNS restaurants)))) (ADVP-LOC-2 (-NONE- *T*))))))))) (. ?)) )'
treestr
= Tree.from_string(treestr)
testtree
testtree
SBARQ
--WHNP-1
--WP
--What
--SQ
--NP-SBJ-1
---NONE-
--*T*
--VP
--VBZ
--is
--NP-PRD
--NP
--DT
--the
--JJS
--best
--NN
--place
--SBAR
--WHADVP-2
---NONE-
--*0*
--S
--NP-SBJ
---NONE-
--*PRO*
--VP
--TO
--to
--VP
--VB
--get
--NP
--NP
--NNS
--discounts
--PP
--IN
--for
--NP
--NML
--NNP
--San
--NNP
--Francisco
--NNS
--restaurants
--ADVP-LOC-2
---NONE-
--*T*
--.
--?
The tests should test the four combinations of distance
and closest
listed above with the same query data
, both with and without terminals_only=True
and terminals_only=False
(eight tests in total). Two further tests should test distance=np.inf, closest=True, terminals_only=True
for a case where only a single element should be returned and a case where multiple elements in the tree should be returned.
# write tests here
Remember that we talked in class about what it would mean to take the distance between a string and a collection of strings: basically, the minimum of the edit distances between the string and each string in that set.
We can use this concept in two ways here. The first is to view the tree as a container for some data and to compute the minimum distance between a query and any data contained in the tree. Alternatively, we can think of the query itself as determining a set and compute the minimum distance of each piece of data in the tree to that set. Task 2 will implement the former and Task 3 the latter.
I’ve copied the corpus reader we developed for the English Web Treebank in class below. We’ll make use of this for Task 2. (You’ll need to grab LDC2012T13.tgz
from the course Google drive.)
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1ygMIl1w6wz6A24oxkLwirunSKXb9EW12' -O 'LDC2012T13.tgz'
import tarfile
from collections import defaultdict
class EnglishWebTreebankOld:
def __init__(self, root='LDC2012T13.tgz'):
def trees():
with tarfile.open(root) as corpus:
for fname in corpus.getnames():
if '.xml.tree' in fname:
with corpus.extractfile(fname) as treefile:
= treefile.readline().decode()
treestr yield fname, Tree.from_string(treestr)
self._trees = trees()
def items(self):
for fn, tlist in self._trees:
yield fn, tlist
Task 2
Lines: 3
Define an instance method fuzzy_find
for the corpus reader class that computes the minimum distance between a query and a tree for all trees in the corpus. It should return a list of tuples with the first element a tree ID, the second an index in that tree, the third the data at that index and the fourth the distance between the query and that index. A tuple should be included in the returned list only if the distance is equal to the minimum across trees in the corpus.
Hint: this should be very straightforward using a particular parameterization for Tree1.fuzzy_find
. Which one?
class EnglishWebTreebank(EnglishWebTreebankOld):
def fuzzy_find(self, data: str | list[str]) -> list[FuzzyFindResult]:
'''Find the trees in the corpus closest to the query data
Parameters
----------
data
'''
raise NotImplementedError
Now, load this corpus.
= EnglishWebTreebank() ewt
Write a single test for a piece of data you know exists in some tree in the corpus. (Determiners or auxiliary verbs are good candidates.) Thus, the minimum distance will be zero and your method should return only trees that contain that element. Note that this test should use some already existing method to produce the correct set of trees.
Hint: such a method already exists in the TreeOld
class.
# write test here
The next task will look at computing distance between the elements of a tree and a query set defined by a regular expression. Here is a regular expression class based on the formal definition of regular expressions I gave you in class.
from itertools import product
class Regex:
"""A regular expression
Parameters
----------
regex_parsed
maxlength
"""
= pyparsing.Word(pyparsing.alphas, exact=1).setName("character") # <- use 'exact', not 'max'
CHAR
= pyparsing.Suppress('(')
LPAR = pyparsing.Suppress(')')
RPAR
= pyparsing.Forward()
PARSER = pyparsing.Group(LPAR + PARSER + RPAR)
GROUP = pyparsing.oneOf("* ? +")
QUANT = '|'
DSJ
= pyparsing.Group((CHAR | GROUP) + QUANT) | pyparsing.Group(CHAR + DSJ + CHAR) | CHAR | GROUP
ITEM = pyparsing.OneOrMore(ITEM)
ITEMSEQ
<<= pyparsing.delimitedList(ITEMSEQ, pyparsing.Empty())
PARSER
def __init__(self, regex_parsed: List[Union[str, List]], maxlength: int):
self._regex_parsed = regex_parsed
self._maxlength = maxlength
@classmethod
def from_string(cls, regexstr: str, maxlength: int = 30):
if regexstr[0] != '(':
= '(' + regexstr
regexstr
if regexstr[-1] != ')':
= regexstr +')'
regexstr
= cls.PARSER.parseString(regexstr)[0]
regex_parsed
return cls(regex_parsed, maxlength)
def __iter__(self):
self._gen = self._construct_regex_generator()
return self
def __next__(self):
return next(self._gen)
def _construct_regex_generator(self, regex=None):
if regex is None:
= self._regex_parsed
regex
if isinstance(regex, str):
if len(regex) > self._maxlength:
raise StopIteration
yield regex
elif regex[1] in ['*', '+']:
= 0 if regex[1] == '*' else 1
i while True:
for s in self._construct_regex_generator(regex[0]):
yield s*i
+= 1
i
if i > self._maxlength:
break
elif regex[1] == '?':
yield ''
yield regex[0]
elif regex[1] == '|':
= self._construct_regex_generator(regex[0])
left = self._construct_regex_generator(regex[2])
right
= s2 = ''
s1
while True:
if len(s1) <= self._maxlength:
= next(left)
s1 yield s1
if len(s2) <= self._maxlength:
= next(right)
s2 yield s2
if len(s1) > self._maxlength and len(s2) > self._maxlength:
break
else:
= [self._construct_regex_generator(r) for r in regex]
evaluated for p in product(*evaluated):
= ''.join(p)
c
if len(c) <= self._maxlength:
yield c
The way to use this class to generate the set of strings associated with a regular expression is tree an instance of the Regex
class as a generator.
Importantly, I’ve include a way of only generating strings of less than some length threshold maxlength
in the case that your regular expression evaluates to an infinite set.
for s in Regex.from_string('co+lou?r', 20):
print(s)
Task 3
Lines: 15
Define a new version of fuzzy_find
that behaves exactly the same as your version from Task 1 except that it allows the query data
to be a regular expression parsable by Regex.from_string
. Make sure that you correctly handle infinite sets.
Hint: your new fuzzy_find
will be nearly identical to the old one. My implementation only has a single additional line.
class Tree(TreeOld):
= EditDistance(1., 1., 1.)
DIST
def fuzzy_find(self, data: str | list[str],
bool = True,
closest: float = np.inf,
distance: bool = True,
case_fold: bool = True,
terminals_only: tuple[int] = tuple()) -> list[FuzzyFindResult]:
index_path:
'''Find the (closest) strings within a distance of the set defined by a regex
Defaults to computing the closest strings among the terminals and ignoring case
Parameters
----------
data
the regex to match against
closest
whether to return only the closest strings or all strings within distance
distance
the distance within which a string must be
case_fold
whether to lower-case the data
terminals_only
whether to only search the terminals
index_path
'''
raise NotImplementedError
Write tests analogous to the ones you wrote for Task 1.
# write tests here