Source code for discodop.demos

"""Examples of various formalisms encoded in LCFRS grammars."""
from __future__ import print_function, absolute_import
from math import exp
from . import treetransforms, plcfrs, kbest
from .tree import Tree
from .grammar import treebankgrammar
from .containers import Grammar


[docs]def tree_adjoining_grammar():
	"""Example of a tree-adjoining grammar (TAG) encoded as an LCFRS.

	Taken from: Chen & Vijay-Shanker (IWPT 2000), Automated extraction of TAGs
	from the Penn treebank. http://nlp.cs.nyu.edu/nycnlp/autoextract.ps

	- no epsilon productions
	- non-terminals have identifiers to encode elementary trees of depth > 2.
	"""
	print("Tree-Adjoining Grammars in LCFRS")
	print('''initial trees:
(S (NP ) (VP (V fell)))
(NP (NN prices))
auxiliary trees:
(S (ADVP (RB Later) (S* ))
(VP (ADVP (RB drastically)) (VP* ))''')
	grammar = Grammar([
			((('ADVP#1', 'RB#1'), ((0, ), )), 1),
			((('ADVP#2', 'RB#2'), ((0, ), )), 1),
			((('NP', 'NN#1'), ((0, ), )), 1),
			((('ROOT', 'S'), ((0, ), )), 1),
			((('S', 'NP', 'VP'), ((0, 1), )), 1),
			((('S', 'ADVP#1', 'S'), ((0, 1), )), 1),
			((('VP', 'V#1'), ((0, ), )), 1),
			((('VP', 'ADVP#2', 'VP'), ((0, 1), )), 1),
			((('RB#1', 'Epsilon'), ('Later', )), 1),
			((('NN#1', 'Epsilon'), ('prices', )), 1),
			((('V#1', 'Epsilon'), ('fell', )), 1),
			((('RB#2', 'Epsilon'), ('drastically', )), 1)])
	print(grammar)
	assert parse(grammar, "prices fell".split())
	assert parse(grammar, "prices drastically fell".split())
	assert parse(grammar, "Later prices fell".split())
	assert parse(grammar, "Later prices drastically fell".split())

	# taken from: slides for course Grammar Formalisms, Kallmeyer (2011),
	# Mildly Context-Sensitive Grammar Formalisms:
	# LCFRS: Relations to other Formalisms
	# https://user.phil.hhu.de/~kallmeyer/GrammarFormalisms/4lcfrs-related-formalisms.pdf
	print("the language {d} + {a**n b**m c**m d **n} with n>0, m>=0")
	print('''initial trees:
(S a (S Epsilon) F)
(F d)
auxiliary trees:
(S b S* c)''')
	grammar = Grammar([
			((('ROOT', 'a1'), ((0, ), )), 1),
			((('ROOT', 'a2'), ((0, ), )), 1),
			((('a1', 'a_b', 'a2'), ((0, 1), )), 1),
			((('a1', '_a', 'a2'), ((0, 1), )), 1),
			((('a2', '_d'), ((0, ), )), 1),
			((('a_b', '_a', 'b'), ((0, 1), )), 1),
			((('b', '_b', '_c'), ((0, 1), )), 1),
			((('b', 'b_2', 'b'), ((0, 1, 0), )), 1),
			((('b_2', '_b', '_c'), ((0, ), (1, ))), 1),
			((('_a', 'Epsilon'), ('a', )), 1),
			((('_b', 'Epsilon'), ('b', )), 1),
			((('_c', 'Epsilon'), ('c', )), 1),
			((('_d', 'Epsilon'), ('d', )), 1)])
	print(grammar)
	assert parse(grammar, list("d"))
	assert parse(grammar, list("ad"))
	assert parse(grammar, list("abcd"))
	assert parse(grammar, list("abbccd"))
	print("wrong:")
	assert not parse(grammar, list("abbbccd"))

	# Taken from: Boullier (1998), Generalization of Mildly
	# Context-Sensitive Formalisms. http://aclweb.org/anthology/W98-0105
	# Epsilon replaced with '|', added preterminal rules w/underscores
	print("the language { ww | w in {a,b}* }")
	print('''initial trees:
		(S (A Epsilon))
		auxiliary trees:
		(A a (A A*) a)
		(A b (A A*) b)
		(A (A A*))''')
	grammar = Grammar([
			((('_aa', '_a', '_a'), ((0, ), (1, ))), 1),
			((('_bb', '_b', '_b'), ((0, ), (1, ))), 1),
			((('A', '_aa', 'A'), ((0, 1), (0, 1))), 1),
			((('A', '_bb', 'A'), ((0, 1), (0, 1))), 1),
			((('A', '_aa'), ((0, ), (0, ))), 1),
			((('A', '_bb'), ((0, ), (0, ))), 1),
			((('ROOT', '_|'), ((0, ), )), 1),
			((('ROOT', 'A', '_|'), ((0, 1, 0), )), 1),
			((('_a', 'Epsilon'), ('a', )), 1),
			((('_b', 'Epsilon'), ('b', )), 1),
			((('_|', 'Epsilon'), ('|', )), 1)])
	print(grammar)
	assert parse(grammar, list("a|a"))
	assert parse(grammar, list("ab|ab"))
	assert parse(grammar, list("abaab|abaab"))
	print("wrong:")
	assert not parse(grammar, list("a|b"))
	assert not parse(grammar, list("aa|bb"))


[docs]def dependencygrammar():
	"""An example dependency structure encoded in an LCFRS grammar.

	Taken from: Gildea (2010, fig. 4), Optimal Parsing Strategies for Linear
	Context-Free Rewriting Systems. http://aclweb.org/anthology/N10-1118

	- rules have to be binarized
	- lexical rules have to be unary

	These have been dealt with by introducing nodes w/underscores."""
	print("A dependency grammar in an LCFRS:")
	grammar = Grammar([
			((('NMOD', '_A'), ((0, ), )), 1),
			((('NMOD', '_the'), ((0, ), )), 1),
			((('NMOD_hearing', 'NMOD', '_hearing'), ((0, 1), )), 1),
			((('NP', 'NMOD', '_issue'), ((0, 1), )), 1),
			((('PP', '_on', 'NP'), ((0, 1, ), )), 1),
			((('ROOT', 'SBJ', 'is_VC'), ((0, 1, 0, 1), )), 1),
			((('SBJ', 'NMOD_hearing', 'PP'), ((0, ), (1, ))), 1),
			((('TMP', '_today'), ((0, ), )), 1),
			((('VC', '_scheduled', 'TMP'), ((0, ), (1, ))), 1),
			((('is_VC', '_is', 'VC'), ((0, 1), (1, ))), 1),
			((('_A', 'Epsilon'), ('A', )), 1),
			((('_hearing', 'Epsilon'), ('hearing', )), 1),
			((('_is', 'Epsilon'), ('is', )), 1),
			((('_scheduled', 'Epsilon'), ('scheduled', )), 1),
			((('_on', 'Epsilon'), ('on', )), 1),
			((('_the', 'Epsilon'), ('the', )), 1),
			((('_issue', 'Epsilon'), ('issue', )), 1),
			((('_today', 'Epsilon'), ('today', )), 1)])
	print(grammar)
	testsent = "A hearing is scheduled on the issue today".split()
	assert parse(grammar, testsent)


[docs]def bitext():
	"""Bitext parsing with a synchronous CFG.

	Translation would require a special decoder (instead of normal k-best
	derivations where the whole sentence is given)."""
	print("bitext parsing with a synchronous CFG")
	trees = [Tree(a) for a in '''\
	(ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\
		(PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3)))
	(ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\
		(NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))'''.split('\n')]
	sents = [["0"] * len(a.leaves()) for a in trees]
	for a in trees:
		treetransforms.binarize(a)
	compiled_scfg = Grammar(treebankgrammar(trees, sents))
	print("sentences:")
	for tree in trees:
		print(' '.join(w for _, w in sorted(tree.pos())))
	print("treebank:")
	for tree in trees:
		print(tree)
	print(compiled_scfg, "\n")

	print("correct translations:")
	assert parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | John aimes Mary".split())
	assert parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | la pizza manque a` John".split())

	print("incorrect translations:")
	assert not parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | Mary aimes John".split())
	assert not parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | John manque a` la pizza".split())

	# the following SCFG is taken from:
	# http://cdec-decoder.org/index.php?title=SCFG_translation
	# the grammar has been binarized and some new non-terminals had to be
	# introduced because terminals cannot appear in binary rules.
	lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah", "fand",
		"small", "little", "big", "large", "house", "shell", "a", "I",
		"saw", "found")
	another_scfg = Grammar([
			((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5),
			((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1),
			((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9),
			((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8),
			((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345),
			((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1),
			((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1),
			((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6),
			((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5),
			((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1),
			((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3),
			((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1),
			((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2),
			((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1),
			((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4),
			((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)]
			+ [((('_%s' % word, 'Epsilon'), (word, )), 1)
					for word in lexicon])
	print(another_scfg)
	sents = [
		"ich sah ein kleines Haus | I saw a small house".split(),
		"ich sah ein kleines Haus | I saw a little house".split(),
		"ich sah ein kleines Haus | I saw a small shell".split(),
		"ich sah ein kleines Haus | I saw a little shell".split()]
	for sent in sents:
		assert parse(another_scfg, sent), sent


def parse(compiledgrammar, testsent, testtags=None):
	"""Parse a sentence with a grammar."""
	chart, _ = plcfrs.parse(testsent,
		compiledgrammar, tags=testtags, exhaustive=True)
	print("input:", ' '.join("%d:%s" % a
			for a in enumerate(testtags if testtags else testsent)), end=' ')
	if chart:
		print()
		results = kbest.lazykbest(chart, 10)[0]
		for tree, prob in results:
			tree = Tree(tree)
			treetransforms.unbinarize(tree)
			print(exp(-prob), tree)
		print()
		return True
	else:
		print("no parse!\n")
		print(chart)
		return False


def test():
	"""Alias for main()."""
	main()


def main():
	"""Run all examples."""
	dependencygrammar()
	tree_adjoining_grammar()
	bitext()


__all__ = ['tree_adjoining_grammar', 'dependencygrammar', 'bitext']