Source code for discodop.demos

"""Examples of various formalisms encoded in LCFRS grammars."""
from __future__ import print_function, absolute_import
from math import exp
from . import treetransforms, plcfrs, kbest
from .tree import Tree
from .grammar import treebankgrammar
from .containers import Grammar


[docs]def tree_adjoining_grammar(): """Example of a tree-adjoining grammar (TAG) encoded as an LCFRS. Taken from: Chen & Vijay-Shanker (IWPT 2000), Automated extraction of TAGs from the Penn treebank. http://nlp.cs.nyu.edu/nycnlp/autoextract.ps - no epsilon productions - non-terminals have identifiers to encode elementary trees of depth > 2. """ print("Tree-Adjoining Grammars in LCFRS") print('''initial trees: (S (NP ) (VP (V fell))) (NP (NN prices)) auxiliary trees: (S (ADVP (RB Later) (S* )) (VP (ADVP (RB drastically)) (VP* ))''') grammar = Grammar([ ((('ADVP#1', 'RB#1'), ((0, ), )), 1), ((('ADVP#2', 'RB#2'), ((0, ), )), 1), ((('NP', 'NN#1'), ((0, ), )), 1), ((('ROOT', 'S'), ((0, ), )), 1), ((('S', 'NP', 'VP'), ((0, 1), )), 1), ((('S', 'ADVP#1', 'S'), ((0, 1), )), 1), ((('VP', 'V#1'), ((0, ), )), 1), ((('VP', 'ADVP#2', 'VP'), ((0, 1), )), 1), ((('RB#1', 'Epsilon'), ('Later', )), 1), ((('NN#1', 'Epsilon'), ('prices', )), 1), ((('V#1', 'Epsilon'), ('fell', )), 1), ((('RB#2', 'Epsilon'), ('drastically', )), 1)]) print(grammar) assert parse(grammar, "prices fell".split()) assert parse(grammar, "prices drastically fell".split()) assert parse(grammar, "Later prices fell".split()) assert parse(grammar, "Later prices drastically fell".split()) # taken from: slides for course Grammar Formalisms, Kallmeyer (2011), # Mildly Context-Sensitive Grammar Formalisms: # LCFRS: Relations to other Formalisms # https://user.phil.hhu.de/~kallmeyer/GrammarFormalisms/4lcfrs-related-formalisms.pdf print("the language {d} + {a**n b**m c**m d **n} with n>0, m>=0") print('''initial trees: (S a (S Epsilon) F) (F d) auxiliary trees: (S b S* c)''') grammar = Grammar([ ((('ROOT', 'a1'), ((0, ), )), 1), ((('ROOT', 'a2'), ((0, ), )), 1), ((('a1', 'a_b', 'a2'), ((0, 1), )), 1), ((('a1', '_a', 'a2'), ((0, 1), )), 1), ((('a2', '_d'), ((0, ), )), 1), ((('a_b', '_a', 'b'), ((0, 1), )), 1), ((('b', '_b', '_c'), ((0, 1), )), 1), ((('b', 'b_2', 'b'), ((0, 1, 0), )), 1), ((('b_2', '_b', '_c'), ((0, ), (1, ))), 1), ((('_a', 'Epsilon'), ('a', )), 1), ((('_b', 'Epsilon'), ('b', )), 1), ((('_c', 'Epsilon'), ('c', )), 1), ((('_d', 'Epsilon'), ('d', )), 1)]) print(grammar) assert parse(grammar, list("d")) assert parse(grammar, list("ad")) assert parse(grammar, list("abcd")) assert parse(grammar, list("abbccd")) print("wrong:") assert not parse(grammar, list("abbbccd")) # Taken from: Boullier (1998), Generalization of Mildly # Context-Sensitive Formalisms. http://aclweb.org/anthology/W98-0105 # Epsilon replaced with '|', added preterminal rules w/underscores print("the language { ww | w in {a,b}* }") print('''initial trees: (S (A Epsilon)) auxiliary trees: (A a (A A*) a) (A b (A A*) b) (A (A A*))''') grammar = Grammar([ ((('_aa', '_a', '_a'), ((0, ), (1, ))), 1), ((('_bb', '_b', '_b'), ((0, ), (1, ))), 1), ((('A', '_aa', 'A'), ((0, 1), (0, 1))), 1), ((('A', '_bb', 'A'), ((0, 1), (0, 1))), 1), ((('A', '_aa'), ((0, ), (0, ))), 1), ((('A', '_bb'), ((0, ), (0, ))), 1), ((('ROOT', '_|'), ((0, ), )), 1), ((('ROOT', 'A', '_|'), ((0, 1, 0), )), 1), ((('_a', 'Epsilon'), ('a', )), 1), ((('_b', 'Epsilon'), ('b', )), 1), ((('_|', 'Epsilon'), ('|', )), 1)]) print(grammar) assert parse(grammar, list("a|a")) assert parse(grammar, list("ab|ab")) assert parse(grammar, list("abaab|abaab")) print("wrong:") assert not parse(grammar, list("a|b")) assert not parse(grammar, list("aa|bb"))
[docs]def dependencygrammar(): """An example dependency structure encoded in an LCFRS grammar. Taken from: Gildea (2010, fig. 4), Optimal Parsing Strategies for Linear Context-Free Rewriting Systems. http://aclweb.org/anthology/N10-1118 - rules have to be binarized - lexical rules have to be unary These have been dealt with by introducing nodes w/underscores.""" print("A dependency grammar in an LCFRS:") grammar = Grammar([ ((('NMOD', '_A'), ((0, ), )), 1), ((('NMOD', '_the'), ((0, ), )), 1), ((('NMOD_hearing', 'NMOD', '_hearing'), ((0, 1), )), 1), ((('NP', 'NMOD', '_issue'), ((0, 1), )), 1), ((('PP', '_on', 'NP'), ((0, 1, ), )), 1), ((('ROOT', 'SBJ', 'is_VC'), ((0, 1, 0, 1), )), 1), ((('SBJ', 'NMOD_hearing', 'PP'), ((0, ), (1, ))), 1), ((('TMP', '_today'), ((0, ), )), 1), ((('VC', '_scheduled', 'TMP'), ((0, ), (1, ))), 1), ((('is_VC', '_is', 'VC'), ((0, 1), (1, ))), 1), ((('_A', 'Epsilon'), ('A', )), 1), ((('_hearing', 'Epsilon'), ('hearing', )), 1), ((('_is', 'Epsilon'), ('is', )), 1), ((('_scheduled', 'Epsilon'), ('scheduled', )), 1), ((('_on', 'Epsilon'), ('on', )), 1), ((('_the', 'Epsilon'), ('the', )), 1), ((('_issue', 'Epsilon'), ('issue', )), 1), ((('_today', 'Epsilon'), ('today', )), 1)]) print(grammar) testsent = "A hearing is scheduled on the issue today".split() assert parse(grammar, testsent)
[docs]def bitext(): """Bitext parsing with a synchronous CFG. Translation would require a special decoder (instead of normal k-best derivations where the whole sentence is given).""" print("bitext parsing with a synchronous CFG") trees = [Tree(a) for a in '''\ (ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\ (PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3))) (ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\ (NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))'''.split('\n')] sents = [["0"] * len(a.leaves()) for a in trees] for a in trees: treetransforms.binarize(a) compiled_scfg = Grammar(treebankgrammar(trees, sents)) print("sentences:") for tree in trees: print(' '.join(w for _, w in sorted(tree.pos()))) print("treebank:") for tree in trees: print(tree) print(compiled_scfg, "\n") print("correct translations:") assert parse(compiled_scfg, ["0"] * 7, "John likes Mary | John aimes Mary".split()) assert parse(compiled_scfg, ["0"] * 9, "John misses pizza | la pizza manque a` John".split()) print("incorrect translations:") assert not parse(compiled_scfg, ["0"] * 7, "John likes Mary | Mary aimes John".split()) assert not parse(compiled_scfg, ["0"] * 9, "John misses pizza | John manque a` la pizza".split()) # the following SCFG is taken from: # http://cdec-decoder.org/index.php?title=SCFG_translation # the grammar has been binarized and some new non-terminals had to be # introduced because terminals cannot appear in binary rules. lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah", "fand", "small", "little", "big", "large", "house", "shell", "a", "I", "saw", "found") another_scfg = Grammar([ ((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5), ((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1), ((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9), ((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8), ((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345), ((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1), ((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1), ((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6), ((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5), ((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1), ((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3), ((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1), ((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2), ((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1), ((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4), ((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)] + [((('_%s' % word, 'Epsilon'), (word, )), 1) for word in lexicon]) print(another_scfg) sents = [ "ich sah ein kleines Haus | I saw a small house".split(), "ich sah ein kleines Haus | I saw a little house".split(), "ich sah ein kleines Haus | I saw a small shell".split(), "ich sah ein kleines Haus | I saw a little shell".split()] for sent in sents: assert parse(another_scfg, sent), sent
def parse(compiledgrammar, testsent, testtags=None): """Parse a sentence with a grammar.""" chart, _ = plcfrs.parse(testsent, compiledgrammar, tags=testtags, exhaustive=True) print("input:", ' '.join("%d:%s" % a for a in enumerate(testtags if testtags else testsent)), end=' ') if chart: print() results = kbest.lazykbest(chart, 10)[0] for tree, prob in results: tree = Tree(tree) treetransforms.unbinarize(tree) print(exp(-prob), tree) print() return True else: print("no parse!\n") print(chart) return False def test(): """Alias for main().""" main() def main(): """Run all examples.""" dependencygrammar() tree_adjoining_grammar() bitext() __all__ = ['tree_adjoining_grammar', 'dependencygrammar', 'bitext']