# -*- coding: UTF-8 -*-
"""Treebank transformations.
- Transforms (primarily state splits) listed by name
- Relational-realizational transform
"""
from __future__ import division, print_function, absolute_import, \
unicode_literals
import re
from itertools import islice
from .tree import Tree, ParentedTree, escape, unescape, ptbescape
from .treebank import EXPORTNONTERMINAL
from .treetransforms import addfanoutmarkers, removefanoutmarkers
from .punctuation import punctprune, PUNCTUATION
from .util import ishead
# Indices into the ``source`` list of a tree node (``node.source[FUNC]``, &c.)
FIELDS = (0, 1, 2, 3, 4, 5)
(WORD, LEMMA, TAG, MORPH, FUNC, PARENT) = FIELDS
# Separator between a label and an annotation (state split) appended to it.
STATESPLIT = '^'
# A maximal run of characters that are not label punctuation.
LABELRE = re.compile("[^^|<>,;:_-]+")
# One of the case values Nom, Acc, Gen, Dat as a separate word.
CASERE = re.compile(r'\b(Nom|Acc|Gen|Dat)\b')
# A token that is exactly one of: de, De, des, Des, du, d'
DERE = re.compile("^([Dd]es?|du|d')$")
# A concatenation of one or more NP / PP labels, ending in PP.
PPORNP = re.compile('^(NP|PP)+PP$')
# A four-digit number between 1900 and 2099.
YEARRE = re.compile('^(?:19|20)[0-9]{2}$')
# Named presets: each key is an alias for a sequence of transformation names.
PRESETS = {
    # basic state splits, German, English, Dutch:
    'negra': ('S-RC', 'VP-GF', 'NP', 'PUNCT'),
    'wsj': ('PTBbrackets', 'S-WH', 'VP-HD', 'S-INF'),
    'alpino': ('PUNCT', ),
    # extensive state splits following particular papers:
    # French
    'green2013ftb': [
        'markinf', 'markpart', 'de2', 'markp1', 'mwadvs', 'mwadvsel1',
        'mwadvsel2', 'mwnsel1', 'mwnsel2', 'PUNCT', 'TAGPA',
    ],
    # English
    # These are the "-goodPCFG" options of the Stanford Parser
    'km2003wsj': [
        'PTBbrackets', 'splitIN4', 'splitPercent', 'splitPoss', 'splitCC',
        'unaryDT', 'unaryRB', 'splitAux2', 'splitVP3', 'splitSGapped',
        'splitTMP', 'splitBaseNP', 'dominatesV', 'splitNPADV',
        'markDitransV', 'MARK-YEAR',
    ],
    # a simpler variant mentioned in Bansal & Klein 2010
    'km2003simple': [
        'PTBbrackets', 'splitIN4', 'splitPercent', 'splitPoss', 'splitCC',
        'unaryDT', 'unaryRB', 'splitAux2', 'splitSGapped', 'splitBaseNP',
        'dominatesV', 'splitNPADV', 'markDitransV', 'MARK-YEAR',
    ],
    # German
    'fraser2013tiger': [
        'elimNKCJ', 'addUnary', 'APPEND-FUNC', 'addCase', 'lexPrep',
        'PUNCT', 'adjAttach', 'relPath', 'whFeat', 'nounSeq',
        'properChunks', 'markAP', 'subConjType', 'VPfeat', 'noHead',
        'noSubj', 'MARK-YEAR',
    ],
    # Dutch
    'lassy': [
        'nladdunary', 'nlelimcnj', 'nlselectmorph', 'PUNCT',
        'MARK-YEAR', 'nlpercolatemorph', 'nlmwuhead',
    ],
    # this variant adds function tags to non-terminal labels
    'lassy-func': [
        'nladdunary', 'nlelimcnj', 'APPEND-FUNC', 'nlselectmorph', 'PUNCT',
        'MARK-YEAR', 'nlpercolatemorph', 'nlmwuhead',
    ],
}
# Label mappings for multi-level coarse-to-fine parsing,
# following Charniak et al. (2006).
# http://aclweb.org/anthology/N06-1022
# Structure: {treebank: {level: {coarselabel: {finelabel, ...}}}}.
# At every level the coarse classes partition the phrase labels of level 0.
MAPPINGS = {
    'ptb': {
        # level 0: P (all phrase labels)
        0: {'P': {'S', 'VP', 'UCP', 'SQ', 'SBAR', 'SBARQ', 'SINV',
                'NP', 'NAC', 'NX', 'LST', 'X', 'FRAG', 'PRT|ADVP',
                'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
                'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP'}},
        # level 1: HP (arguments), MP (modifiers)
        1: {'HP': {'S', 'VP', 'UCP', 'SQ', 'SBAR', 'SBARQ', 'SINV',
                'NP', 'NAC', 'NX', 'LST', 'X', 'FRAG'},
            'MP': {'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
                'PRT|ADVP', 'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP',
                'WHPP'}},
        # level 2: S (verbal), N (nominal), A (adjectival),
        # P (prepositional)
        # note: PRT is part of both A_ and P_ in the paper;
        # UCP is part of both S_ and N_
        2: {'S_': {'S', 'VP', 'SQ', 'SBAR', 'SBARQ', 'SINV'},
            'N_': {'NP', 'NAC', 'NX', 'LST', 'X', 'UCP', 'FRAG'},
            'A_': {'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
                'PRT|ADVP'},
            'P_': {'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP'}},
        # level 3: no-op, return original treebank labels
    },
    'negra': {
        # level 0: P (all phrase labels)
        0: {'P': {'--', 'AA', 'AP', 'AVP', 'CAC', 'CAP', 'CAVP', 'CCP',
                'CH', 'CNP', 'CO', 'CPP', 'CS', 'CVP', 'CVZ', 'DL', 'ISU',
                'MPN', 'MTA', 'NM', 'NP', 'PN', 'PP', 'QL', 'S', 'VP',
                'VZ'}},
        # level 1: HP (arguments), MP (modifiers)
        1: {'HP': {'NP', 'S', 'VP', 'VZ', 'CO', 'AA', 'CNP', 'CS', 'CVP',
                'CVZ', 'PN', 'MPN', 'NM', 'CH', 'CCP', 'DL', 'ISU', 'QL'},
            'MP': {'--', 'AP', 'PP', 'AVP', 'CAP', 'CPP', 'CAVP', 'CAC',
                'MTA'}},
        # level 2: S (verbal), N (nominal), A (adjectival),
        # P (prepositional)
        # NB: 'P_' belongs inside level 2; a misplaced closing brace used to
        # turn it into a bogus extra level named 'P_'.
        2: {'S_': {'S', 'VP', 'VZ', 'CO', 'AA', 'CS', 'CVP',
                'CVZ', 'CCP', 'DL', 'ISU', 'QL'},
            'N_': {'NP', 'CNP', 'PN', 'MPN', 'NM', 'CH'},
            'A_': {'--', 'AP', 'AVP', 'CAP', 'CAVP', 'MTA'},
            'P_': {'PP', 'CPP', 'CAC'}},
        # level 3: no-op, return original treebank labels
    },
    'alpino': {
        # level 0: P (all phrase labels)
        0: {'P': {'ADVP', 'AHI', 'AP', 'CONJ', 'CP', 'DETP', 'DU', 'INF',
                'MWU', 'NP', 'OTI', 'PP', 'PPART', 'PPRES', 'REL', 'SMAIN',
                'SSUB', 'SV1', 'SVAN', 'TI', 'WHQ', 'WHREL', 'WHSUB'}},
        # level 1: HP (arguments), MP (modifiers)
        # NB: REL is listed under MP only (it used to be in both classes),
        # in agreement with level 2 where it is grouped with AP and ADVP.
        1: {'HP': {'AHI', 'CONJ', 'CP', 'DETP', 'DU', 'INF', 'MWU', 'NP',
                'OTI', 'PPART', 'PPRES', 'SMAIN', 'SSUB', 'SVAN',
                'SV1', 'TI', 'WHSUB', 'WHQ'},
            'MP': {'AP', 'ADVP', 'PP', 'REL', 'WHREL'}},
        # level 2: S (verbal), N (nominal), A (adjectival),
        # P (prepositional)
        2: {'S_': {'AHI', 'CP', 'DU', 'INF', 'OTI', 'PPART', 'PPRES',
                'SMAIN', 'SSUB', 'SVAN', 'SV1', 'TI', 'WHSUB', 'WHQ'},
            'N_': {'CONJ', 'DETP', 'MWU', 'NP'},
            'A_': {'AP', 'ADVP', 'REL', 'WHREL'},
            'P_': {'PP'}},
        # level 3: no-op, return original treebank labels
    },
}
def expandpresets(transformations):
    """Replace each preset alias by the transformations it stands for.

    :param transformations: an iterable of transformation names, some of
        which may be keys of ``PRESETS``.
    :returns: a new list in which every name that is a key of ``PRESETS``
        is replaced by the names of that preset; other names are kept."""
    expanded = []
    for name in transformations:
        # names that are not a preset stand for themselves
        expanded.extend(PRESETS.get(name, [name]))
    return expanded
def negratransforms(name, tree, sent):
    """Negra / Tiger transforms.

    Apply the transformation called ``name`` to ``tree`` in-place.

    :param name: the name of a transformation.
    :param tree: the tree to transform; its nodes are modified in-place.
    :param sent: the list of tokens of the sentence (only read by
        ``lexPrep``).
    :returns: True if ``name`` is a transformation handled here,
        False otherwise (in which case nothing is modified)."""
    if name == 'S-RC':  # relative clause => S becomes S^RC
        for s in tree.subtrees(lambda n: n.label == 'S'
                and function(n) == 'RC'):
            s.label += STATESPLIT + 'RC'
    elif name == 'NP':  # split NP by its function tag
        for np in tree.subtrees(lambda n: n.label == 'NP'):
            np.label += STATESPLIT + function(np)
    elif name == 'PP-NP':  # un-flatten PPs by introducing NPs
        addtopp = ('AC', )
        for pp in tree.subtrees(lambda n: n.label == 'PP'):
            ac = [a for a in pp if function(a) in addtopp]
            # anything before an initial preposition goes to the PP
            # (modifiers, punctuation), otherwise it goes to the NP;
            # mutatis mutandis for postpositions.
            funcs = [function(x) for x in pp]
            if 'AC' in funcs and 'NK' in funcs:
                if funcs.index('AC') < funcs.index('NK'):
                    ac[:0] = pp[:funcs.index('AC')]
                if rindex(funcs, 'AC') > rindex(funcs, 'NK'):
                    ac += pp[rindex(funcs, 'AC') + 1:]
            # else:
            #     print('PP but no AC or NK', ' '.join(funcs))
            nk = [a for a in pp if a not in ac]
            # introduce an NP unless there is already an NP in the PP
            # (annotation mistake?), or there is a PN and we want to avoid
            # a cyclic unary of NP -> PN -> NP.
            # (this used to call an undefined name ``s(...)`` instead of
            # testing membership of a tuple.)
            if ac and nk and (len(nk) > 1
                    or nk[0].label not in ('NP', 'PN')):
                pp[:] = []
                pp[:] = ac + [ParentedTree('NP', nk)]
    elif name == 'DP':  # introduce determiner phrases (DPs)
        # determiners = {'ART', 'PDS', 'PDAT', 'PIS', 'PIAT', 'PPOSAT',
        #     'PRELS', 'PRELAT', 'PWS', 'PWAT', 'PWAV'}
        determiners = {'ART'}
        for np in list(tree.subtrees(lambda n: n.label == 'NP')):
            if np[0].label in determiners:
                np.label = 'DP'
                if len(np) > 1 and np[1].label != 'PN':
                    np1 = np[1:]
                    np[1:] = []
                    np[1:] = [ParentedTree('NP', np1)]
    elif name == 'VP-GF':  # VP category split based on function tag
        for vp in tree.subtrees(lambda n: n.label == 'VP'):
            vp.label += STATESPLIT + function(vp)
    elif name == 'VP-FIN_NEGRA':  # introduce finite VP at S level
        # collect objects and modifiers
        # introduce new S level for discourse markers
        newlevel = 'DM'.split()
        addtovp = 'HD AC DA MO NG OA OA2 OC OG PD VO SVP'.split()

        def finitevp(s):
            """Introduce finite VPs grouping verbs and their objects.

            NOTE(review): as written this only moves the children with a
            function in ``addtovp`` behind the other children; no VP node
            is created -- confirm whether that is intended."""
            if any(x.label.startswith('V') and x.label.endswith('FIN')
                    for x in s if isinstance(x, Tree)):
                vp = [a for a in s if function(a) in addtovp]
                # introduce a VP unless it would lead to a unary
                # VP -> VP production
                if len(vp) != 1 or vp[0].label != 'VP':
                    s[:] = [pop(a) for a in s if function(a) not in addtovp
                            ] + [pop(a) for a in vp]

        toplevel_s = []
        if 'S' in labels(tree):
            toplevel_s = [a for a in tree if a.label == 'S']
            for s in toplevel_s:
                while function(s[0]) in newlevel:
                    s[:] = [s[0], ParentedTree('S', s[1:])]
                    s = s[1]
                    toplevel_s = [s]
        elif 'CS' in labels(tree):
            cs = tree[labels(tree).index('CS')]
            toplevel_s = [a for a in cs if a.label == 'S']
        for a in toplevel_s:
            finitevp(a)
    elif name == 'POS-PART':  # introduce POS tag for particle verbs
        for a in tree.subtrees(
                lambda n: any(function(x) == 'SVP' for x in n)):
            svp = [x for x in a if function(x) == 'SVP'].pop()
            # apparently there can be a _verb_ particle without a verb.
            # headlines? annotation mistake?
            if any(map(ishead, a)):
                hd = [x for x in a if ishead(x)].pop()
                if hd.label != a.label:
                    particleverb = ParentedTree(hd.label, [hd, svp])
                    a[:] = [particleverb if ishead(x) else x
                            for x in a if function(x) != 'SVP']
    elif name == 'SBAR':  # introduce SBAR level
        sbarfunc = 'CP'.split()
        # in the annotation, complementizers belong to the first S
        # in S conjunctions, even when they appear to apply to the whole
        # conjunction.
        for s in list(tree.subtrees(lambda n: n.label == 'S'
                and function(n[0]) in sbarfunc and len(n) > 1)):
            s.label = 'SBAR'
            s[:] = [s[0], ParentedTree('S', s[1:])]
    elif name == 'NEST':  # introduce nested structures for modifiers
        # (iterated adjunction instead of sister adjunction)
        adjunctable = {'NP'}  # PP AP VP
        for a in list(tree.subtrees(lambda n: n.label in adjunctable
                and any(function(x) == 'MO' for x in n))):
            modifiers = [x for x in a if function(x) == 'MO']
            if min(n for n, x in enumerate(a) if function(x) == 'MO') == 0:
                modifiers[:] = modifiers[::-1]
            while modifiers:
                modifier = modifiers.pop()
                a[:] = [ParentedTree(a.label,
                        [x for x in a if x != modifier]), modifier]
                a = a[0]
    # The following transformations as described in Fraser et al (CL, 2013)
    # http://aclweb.org/anthology/J13-1005
    elif name == 'addUnary':  # introduce unary NPs
        maxid = getmaxid(tree)
        for node in tree.postorder(lambda n: strip(n.label) in
                {'NN', 'PPER', 'PDS', 'PIS', 'PRELS', 'CARD', 'PN'}
                and strip(n.parent.label) in {'S', 'VP', 'ROOT', 'DL'}):
            if node.label == 'PN' and len(node) == 1:  # only complex PNs
                continue
            # the existing node becomes the new NP; its former content
            # moves to a fresh child node carrying the original label.
            children = node[:]
            node[:] = []
            tag = ParentedTree(node.label, children)
            tag.source = node.source[:]
            node[:] = [tag]
            node.source[TAG] = node.label = 'NP'
            node.source[FUNC] = tag.source[FUNC]
            maxid += 1
            node.source[WORD] = '#%d' % maxid
            node.source[LEMMA] = node.source[MORPH] = '--'
            tag.source[FUNC] = 'HD'
    elif name == 'addCase':  # add case features to POS tags
        for node in tree.subtrees(lambda n: n and isinstance(n[0], int)):
            # NOTE(review): match() only finds a case value at the very
            # start of the morphology field -- confirm this is intended.
            case = CASERE.match(node.source[MORPH])
            if case:
                node.label += '/' + case.group(1)
    elif name == 'elimNKCJ':  # eliminate NK and CJ functions
        for node in tree.subtrees(lambda n: function(n) in {'NK', 'CJ'}):
            if function(node) == 'NK':
                node.source[FUNC] = 'HD'
            else:  # elif function(node) == 'CJ':
                node.source[FUNC] = function(node.parent) or '--'
    elif name == 'lexPrep':  # lexicalize frequent prepositions/conjunctions
        for node in tree.subtrees(lambda n: n and isinstance(n[0], int)):
            word = sent[node[0]].lower()
            if base(node, 'APPR') and word in {
                    'in', 'von', 'auf', 'durch', 'um',
                    'unter', 'unters', 'unterm'}:
                node.label += STATESPLIT + word
            elif base(node, 'KON') and function(node) == 'CD' and (word in {
                    'sowohl', 'als', 'weder', 'entweder', 'noch'}
                    or (word == 'oder'  # 'oder' if preceded by entweder
                        and any(a.lower() == 'entweder'
                            for a in sent[:node[0]]))):
                node.label += STATESPLIT + word
    elif name == 'adjAttach':  # annotate attachments of adjuncts
        for node in tree.subtrees(
                lambda n: strip(n.label) in {'PP', 'AVP', 'ADV', 'ADJD'}):
            if strip(node.parent.label) in {'S', 'VP'}:
                annot = 'V'
            elif strip(node.parent.label) in {'NP', 'PP'}:
                annot = 'N'
            else:
                annot = '0'
            node.label += STATESPLIT + annot
            if strip(node.label) == 'AVP':  # propagate to head child
                for child in node:
                    if ishead(child):
                        child.label += STATESPLIT + annot
                        break
    elif name == 'relPath':  # mark path from relative clause to rel. pronoun
        for node in tree.subtrees(lambda n: base(n, 'S')
                and function(n) == 'RC'):
            for child in node.subtrees(lambda n: strip(n.label) in
                    {'PRELS', 'PRELAT', 'PWAV', 'PWS'}):
                # only the path to the first pronoun found is marked
                child = child.parent
                while child is not node:
                    child.label += STATESPLIT + 'rel'
                    child = child.parent
                node.label += STATESPLIT + 'rel'
                break
            else:  # no rel. pronoun found
                node.label += STATESPLIT + 'norel'
    elif name == 'whFeat':  # mark NP/PP that immediately dominates WH-pronoun
        for node in tree.subtrees(lambda n: strip(n.label) in {'NP', 'PP'} and
                any(strip(a.label) in {'PWAT', 'PWS', 'PWAV'} for a in n)):
            node.label += STATESPLIT + 'wh'
    elif name == 'nounSeq':  # consecutive nouns in NP
        for node in tree.subtrees(lambda n: base(n, 'NP')):
            for n, a in enumerate(node[:-1]):
                if base(a, 'NN') and base(node[n + 1], 'NN'):
                    a.label += STATESPLIT + 'seq'
                    break
    elif name == 'properChunks':  # mark POS tags in proper noun chunks
        for node in tree.subtrees(lambda n: base(n, 'NP')
                and function(n) == 'PNC'):
            for tag in node:
                tag.label += STATESPLIT + 'name'
    elif name == 'markAP':  # mark predicative APs, APs with nominal head
        for node in tree.subtrees(lambda n: base(n, 'AP')):
            if any(base(a, 'ADJD') for a in node.subtrees()):
                node.label += STATESPLIT + 'pred'
            if any(ishead(child) and strip(child.label) in {'NN', 'NP'}
                    for child in node):
                node.label += STATESPLIT + 'nom'
    elif name == 'subConjType':  # mark type of subordinating conj.
        for node in tree.subtrees(lambda n: base(n, 'S')
                and function(n) in {'SB', 'OC', 'MO', 'RE'}):
            for child in node:
                if base(child, 'KOUS'):
                    child.label += STATESPLIT + function(node)
                    break
    elif name == 'VPfeat':  # mark object VPs with head label
        for node in tree.subtrees(lambda n: base(n, 'VP')
                and function(n) == 'OC'):
            for child in node:
                if ishead(child):
                    node.label += STATESPLIT + strip(child.label)
                    break
    elif name == 'noHead':  # constituents without head child
        for node in tree.subtrees(lambda n: n is not tree
                and n and isinstance(n[0], Tree)):
            # The heuristically found heads do not count.
            if not any(function(child) in {'HD', 'PNC', 'AC', 'AVC', 'NMC',
                    'PH', 'PD', 'ADC', 'UC', 'DH'}
                    for child in node):
                node.label += STATESPLIT + 'nohead'
    elif name == 'noSubj':  # conjunct clauses without subject
        for node in tree.subtrees(lambda n: n and isinstance(n[0], Tree)
                and base(n, 'S') and function(n) == 'CJ'):
            if not any(function(child) in {'SB', 'EP'}
                    or strip(child.label) in {'VVIMP', 'VAIMP'}
                    for child in node):
                node.label += STATESPLIT + 'nosubj'
    else:
        return False
    return True
def ptbtransforms(name, tree, sent):
    """Transforms for WSJ section of Penn treebank.

    Apply the transformation called ``name`` in-place to ``tree``
    (``PTBbrackets`` modifies ``sent`` instead).

    :param name: the name of a transformation.
    :param tree: the tree to transform; its nodes are modified in-place.
    :param sent: the list of tokens of the sentence.
    :returns: True if ``name`` is a transformation handled here,
        False otherwise (in which case nothing is modified)."""
    if name == 'S-WH':
        for sbar in tree.subtrees(lambda n: n.label == 'SBAR'):
            for s in sbar:
                if (s.label == 'S'
                        and any(a.label.startswith('WH') for a in s)):
                    s.label += STATESPLIT + 'WH'
    elif name == 'VP-HD':  # VP category split based on head
        for vp in tree.subtrees(lambda n: n.label == 'VP'):
            hd = [x for x in vp if ishead(x)].pop()
            if hd.label == 'VB':
                vp.label += STATESPLIT + 'HINF'
            elif hd.label == 'TO':
                vp.label += STATESPLIT + 'HTO'
            elif hd.label in ('VBN', 'VBG'):
                vp.label += STATESPLIT + 'HPART'
    elif name == 'S-INF':
        for s in tree.subtrees(lambda n: n.label == 'S'):
            hd = [x for x in s if ishead(x)].pop()
            if hd.label in ('VP' + STATESPLIT + 'HINF',
                    'VP' + STATESPLIT + 'HTO'):
                s.label += STATESPLIT + 'INF'
    elif name == 'VP-FIN_WSJ':  # add disc. finite VP when verb is under S
        # this counters the flattening when a VP is not possible because of
        # non-standard word order; e.g. is John happy
        for s in tree.postorder(lambda n: n.label == 'S'):
            if not any(a.label.startswith('VP') for a in s):
                vp = ParentedTree('VP', [])
                for child in list(s):
                    # FIXME: check which functions should not go in the VP
                    # (pre)modifiers unclear.
                    if 'SBJ' not in functions(child):
                        # detach the child from s with the module-level
                        # helper; s.pop() expects an index, not a node.
                        vp.append(pop(child))
                s.append(vp)
    elif name == 'MARK-UNARY':  # add -U to unary nodes to avoid cycles
        for unary in tree.subtrees(lambda n: len(n) == 1
                and isinstance(n[0], Tree)):
            unary.label += STATESPLIT + 'U'
    # The following transformations are translations of
    # the Stanford Parser state splits described in
    # Accurate Unlexicalized Parsing (ACL 2003).
    # http://aclweb.org/anthology/P03-1054
    elif name == 'splitIN':  # Stanford Parser splitIN=3
        for node in tree.subtrees(lambda n: base(n, 'IN')):
            if base(node.parent.parent, 'N') and (
                    base(node.parent, 'P') or
                    base(node.parent, 'A')):
                node.label += STATESPLIT + 'N'
            elif base(node.parent, 'Q') and (
                    base(node.parent.parent, 'N') or
                    base(node.parent.parent, 'ADJP')):
                node.label += STATESPLIT + 'Q'
            elif base(node.parent.parent, 'S'):
                if base(node.parent, 'SBAR'):
                    node.label += STATESPLIT + 'SCC'
                else:
                    node.label += STATESPLIT + 'SC'
            elif base(node.parent, 'SBAR') or base(
                    node.parent, 'WHNP'):
                node.label += STATESPLIT + 'T'
    elif name == 'splitIN4':  # Stanford Parser splitIN=4
        for node in tree.subtrees(lambda n: base(n, 'IN')):
            if base(node.parent.parent, 'N') and (
                    base(node.parent, 'P') or
                    base(node.parent, 'A')):
                node.label += STATESPLIT + 'N'
            elif base(node.parent, 'Q') and (
                    base(node.parent.parent, 'N') or
                    base(node.parent.parent, 'ADJP')):
                node.label += STATESPLIT + 'Q'
            elif node.parent.parent.label[0] == 'S' and not base(
                    node.parent.parent, 'SBAR'):
                if base(node.parent, 'SBAR'):
                    node.label += STATESPLIT + 'SCC'
                elif not base(node.parent, 'NP') and not base(
                        node.parent, 'ADJP'):
                    node.label += STATESPLIT + 'SC'
            elif base(node.parent, 'SBAR') or base(
                    node.parent, 'WHNP') or base(node.parent, 'WHADVP'):
                node.label += STATESPLIT + 'T'
    elif name == 'splitPercent':  # Stanford Parser splitPercent=1
        for node in tree.subtrees(lambda n: n and isinstance(n[0], int)
                and sent[n[0]] == '%'):
            node.label += STATESPLIT + r'%'
    elif name == 'splitPoss':  # Stanford Parser splitPoss=1
        for node in tree.subtrees(lambda n: base(n, 'NP')
                and n[-1].label.startswith('POS')):
            node.label += STATESPLIT + 'P'
    elif name == 'splitCC':  # Stanford Parser splitCC=2
        for node in tree.subtrees(lambda n: base(n, 'CC')):
            if sent[node[0]].lower() == 'but':
                node.label += STATESPLIT + 'B'
            elif sent[node[0]] == '&':
                node.label += STATESPLIT + 'A'
    elif name == 'unaryDT':  # Stanford Parser unaryDT=true
        for node in tree.subtrees(lambda n: base(n, 'DT')
                and len(n.parent) == 1):
            node.label += STATESPLIT + 'U'
    elif name == 'unaryRB':  # Stanford Parser unaryRB=true
        for node in tree.subtrees(lambda n: base(n, 'RB')
                and len(n.parent) == 1):
            node.label += STATESPLIT + 'U'
    elif name == 'splitAux':  # Stanford Parser splitAux=1
        for node in tree.subtrees(lambda n: strip(n.label)
                in {'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'VB'}):
            if sent[node[0]].lower() in {
                    'is', 'am', 'are', 'was', 'were', "'m", "'re", "'s",
                    'being', 'be', 'been'}:
                node.label += STATESPLIT + 'BE'
            elif sent[node[0]].lower() in {
                    'have', "'ve", 'having', 'has', 'had', "'d"}:
                node.label += STATESPLIT + 'HV'
    elif name == 'splitAux2':  # Stanford Parser splitAux=2
        for node in tree.subtrees(lambda n: strip(n.label)
                in {'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'VB'}):
            word = sent[node[0]].lower()
            if word in {"'s", "s"}:
                # 's can be a contraction of both "is" and "have":
                # it is "have" when a VBZ is followed by a VP sibling
                # containing a past tense verb or participle.
                foundAux = False
                for sibling in node.parent:
                    if foundAux:
                        if base(sibling, 'VP') and any(strip(a.label)
                                in {'VBD', 'VBN'} for a in sibling):
                            node.label += STATESPLIT + 'HV'
                            break
                    elif sibling.label.startswith('VBZ'):
                        foundAux = True
                else:  # no evidence for "have" found
                    node.label += STATESPLIT + 'BE'
            elif word in {'am', 'is', 'are', 'was', 'were',
                    "'m", "'re", 'be', 'being', 'been', 'ai'}:
                node.label += STATESPLIT + 'BE'
            elif word in {
                    'have', "'ve", 'having', 'has', 'had', "'d"}:
                node.label += STATESPLIT + 'HV'
    elif name == 'splitVP':  # Stanford Parser splitVP=2
        for node in tree.subtrees(lambda n: base(n, 'VP')):
            for child in node:
                if ishead(child):
                    if strip(child.label) in {'VBZ', 'VBP', 'VBD', 'MD'}:
                        node.label += STATESPLIT + 'VBF'
                    else:
                        node.label += STATESPLIT + strip(child.label)
                    break
    elif name == 'splitVP3':  # Stanford Parser splitVP=3
        for node in tree.subtrees(lambda n: base(n, 'VP')):
            for child in node:
                if ishead(child):
                    if strip(child.label) in {'VBZ', 'VBP', 'VBD', 'MD'}:
                        node.label += STATESPLIT + 'VBF'
                    elif strip(child.label) in {'TO', 'VBG', 'VBN', 'VB'}:
                        node.label += STATESPLIT + strip(child.label)
                    break
    elif name == 'splitSGapped':  # Stanford Parser splitSGapped=3
        for node in tree.subtrees(lambda n: base(n, 'S')):
            # the observations are about the children of this S only,
            # so they must be reset for every S node.
            seenPredCat = seenCC = seenS = False
            seenNP = 0
            for child in node:
                cat2 = child.label
                if cat2.startswith('NP'):
                    seenNP += 1
                elif strip(cat2) in {'VP', 'ADJP', 'PP', 'UCP'}:
                    seenPredCat = True
                elif cat2.startswith('CC'):
                    seenCC = True
                elif cat2.startswith('S'):
                    seenS = True
            if (not (seenCC and seenS)) and (
                    seenNP == 0 or (seenNP == 1 and not seenPredCat)):
                node.label += STATESPLIT + 'G'
    elif name == 'splitTMP':  # Stanford Parser splitTMP=TEMPORAL_ACL03PCFG
        for node in tree.postorder(lambda n: 'TMP' in functions(n)):
            child = node
            hd = None
            # follow head children downwards until a preterminal is reached
            while child and isinstance(child[0], Tree):
                try:
                    i, hd = next(a for a in enumerate(child) if ishead(a[1]))
                except StopIteration:
                    break
                # for a possessive marker, take the preceding sibling
                if strip(hd.label) == 'POS' and i > 0:
                    hd = child[i - 1]
                child = hd
            if 'TMP' in functions(node):
                node.label += STATESPLIT + 'TMP'
            if hd and hd.label.startswith('N'):
                hd.label += STATESPLIT + 'TMP'
    elif name == 'splitBaseNP':  # Stanford Parser splitBaseNP=1
        # Mark NPs that only dominate preterminals
        for node in tree.subtrees(lambda n: base(n, 'NP')):
            if all(a and isinstance(a[0], int) for a in node):
                node.label += STATESPLIT + 'B'
    elif name == 'dominatesV':  # Stanford Parser dominatesV=1
        for node in tree.subtrees(lambda n: base(n, 'VP')):
            if any(tag.startswith('V') or tag.startswith('MD')
                    for _, tag in node.pos()):
                node.label += STATESPLIT + 'v'
    elif name == 'splitNPADV':  # Stanford Parser splitNPADV=1
        for node in tree.subtrees(lambda n:
                base(n, 'NP') and 'ADV' in functions(n)):
            node.label += STATESPLIT + 'ADV'
            try:
                hd = next(a for a in node if ishead(a))
            except StopIteration:
                continue
            if base(hd, 'POS') and hd.parent_index > 0:
                hd = node[hd.parent_index - 1]
            # propagate the mark along the chain of NP heads
            while base(hd, 'NP'):
                hd.label += STATESPLIT + 'ADV'
                try:
                    hd = next(a for a in hd if ishead(a))
                except StopIteration:
                    break
    elif name == 'markDitransV':  # Stanford Parser markDitransV=2
        for node in tree.subtrees(lambda n: n.label.startswith('VB')):
            npargs = sum(1 for a in node.parent if base(a, 'NP')
                    and 'TMP' not in functions(a))
            if npargs >= 2:
                node.label += STATESPLIT + '2Arg'
    elif name == 'PTBbrackets':  # ensure that brackets are in PTB format
        sent[:] = [ptbescape(token) for token in sent]
    else:
        return False
    return True
def ftbtransforms(name, tree, sent):
    """Port of manual FTB enrichments specified in Stanford parser.

    cf. ``FrenchTreebankParserParams.java``

    Apply the transformation called ``name`` to ``tree`` in-place.

    :param name: the name of a transformation.
    :param tree: the tree to annotate; labels are modified in-place.
    :param sent: the list of tokens of the sentence.
    :returns: True if ``name`` is a transformation handled here,
        False otherwise (in which case nothing is modified)."""
    if name == 'markinf':
        for t in tree.subtrees(lambda n: strip(n.label) == "V"
                and isinstance(n.parent, Tree)
                and isinstance(n.parent.parent, Tree)
                and strip(n.parent.label) == "VN"
                and strip(n.parent.parent.label) == "VPinf"):
            t.label += STATESPLIT + "infinitive"
    elif name == 'markpart':
        for t in tree.subtrees(lambda n: strip(n.label) == "V"
                and isinstance(n.parent, Tree)
                and isinstance(n.parent.parent, Tree)
                and strip(n.parent.label) == "VN"
                and strip(n.parent.parent.label) == "VPpart"):
            t.label += STATESPLIT + "participle"
    elif name == 'markvn':
        for t in tree.subtrees(lambda n: strip(n.label) == "VN"):
            # all proper descendants of the VN node
            for sub in islice(t.subtrees(), 1, None):
                sub.label += STATESPLIT + "withVN"
    elif name == 'coord1':
        for t in tree.subtrees(lambda n: strip(n.label) == 'COORD'
                and len(n) >= 2):
            t.label += STATESPLIT + strip(t[1].label)
    elif name == 'de2':
        for t in tree.subtrees(lambda n: strip(n.label) == 'P'
                and DERE.match(sent[n[0]])):
            t.label += STATESPLIT + "de2"
    elif name == 'de3':
        # @NP|PP|COORD >+(@NP|PP) (@PP <, (@P < /^([Dd]es?|du|d')$/))
        for t in tree.subtrees(lambda n:
                strip(n.label) in ("PP", "COORD")):
            # ancestors() may produce None after the root node; drop it
            # and consider every prefix of at least two real ancestors.
            a = [x for x in ancestors(t) if x is not None]
            for n in range(2, len(a) + 1):
                if PPORNP.match("".join(strip(x.label) for x in a[:n])):
                    if (strip(a[n - 1][0].label) == "P"
                            and DERE.match(sent[a[n - 1][0][0]])):
                        t.label += STATESPLIT + "de3"
                        break
    elif name == 'markp1':
        # guard against missing (grand)parents, as markinf / markpart do
        for t in tree.subtrees(lambda n: strip(n.label) == "P"
                and isinstance(n.parent, Tree)
                and isinstance(n.parent.parent, Tree)
                and strip(n.parent.label) == "PP"
                and strip(n.parent.parent.label) == "NP"):
            t.label += STATESPLIT + "n"
    elif name == 'mwadvs':
        for t in tree.subtrees(lambda n: strip(n.label) == "MWADV"
                and isinstance(n.parent, Tree)
                and "S" in n.parent.label):
            t.label += STATESPLIT + "mwadv-s"
    elif name == 'mwadvsel1':
        for t in tree.subtrees(lambda n: strip(n.label) == "MWADV"
                and len(n) == 2
                and strip(n[0].label) == "P"
                and strip(n[1].label) == "N"):
            t.label += STATESPLIT + "mwadv1"
    elif name == 'mwadvsel2':
        for t in tree.subtrees(lambda n: strip(n.label) == "MWADV"
                and len(n) == 3
                and strip(n[0].label) == "P"
                and strip(n[1].label) == "D"
                and strip(n[2].label) == "N"):
            t.label += STATESPLIT + "mwadv2"
    elif name == 'mwnsel1':
        for t in tree.subtrees(lambda n: strip(n.label) == "MWN"
                and len(n) == 2
                and strip(n[0].label) == "N"
                and strip(n[1].label) == "A"):
            t.label += STATESPLIT + "mwn1"
    elif name == 'mwnsel2':
        for t in tree.subtrees(lambda n: strip(n.label) == "MWN"
                and len(n) == 3
                and strip(n[0].label) == "N"
                and strip(n[1].label) == "P"
                and strip(n[2].label) == "N"):
            t.label += STATESPLIT + "mwn2"
    elif name == 'mwnsel3':  # noun-noun compound joined with dash.
        for t in tree.subtrees(lambda n: strip(n.label) == "MWN"
                and len(n) == 3
                and strip(n[0].label) == "N"
                and sent[n[1][0]] == "-"
                and strip(n[2].label) == "N"):
            t.label += STATESPLIT + "mwn3"
    else:
        return False
    return True
def lassytransforms(name, tree, _sent):
    """Transformations for the Dutch Lassy & Alpino treebanks.

    Apply the transformation called ``name`` to ``tree`` in-place.

    :param name: the name of a transformation.
    :param tree: the tree to transform; its nodes are modified in-place.
    :param _sent: unused.
    :returns: True if ``name`` is a transformation handled here,
        False otherwise (in which case nothing is modified)."""
    if name == 'nlselectmorph':
        # extend coarse POS tags with the tag name of the morphology field
        # and those of its features that are in the selection below
        wanted = {'eigen', 'det', 'pron', 'init', 'fin', 'neven', 'onder',
                'prenom', 'nom', 'vrij', 'pv', 'inf', 'vd', 'od'}
        for preterminal in tree.subtrees(
                lambda n: n and isinstance(n[0], int)):
            tagname = preterminal.source[MORPH].split('(')[0]
            kept = ','.join(feat for feat in morphfeats(preterminal)
                    if feat in wanted)
            preterminal.label += '/%s[%s]' % (tagname, kept)
    elif name == 'nlpercolatemorph':
        # copy selected features to ancestors of the preterminal;
        # the value is the maximum number of levels to go up.
        uplevels = {'pv': 2, 'inf': 2}
        for feat, maxlevels in sorted(uplevels.items()):
            suffix = STATESPLIT + feat
            for preterminal in tree.subtrees(lambda n, f=feat: n
                    and isinstance(n[0], int) and f in morphfeats(n)):
                current = preterminal.parent
                for _ in range(maxlevels):
                    # never annotate the root node
                    if current is None or current.parent is None:
                        break
                    if not current.label.endswith(suffix):
                        current.label += suffix
                    current = current.parent
    elif name == 'nlmwuhead':
        # annotate MWU nodes with the label of the first child that is
        # a head or the last child
        expandcat = {'MWU'}
        for mwu in tree.subtrees(lambda n: strip(n.label) in expandcat):
            candidates = (strip(child.label) for child in mwu
                    if ishead(child) or child is mwu[-1])
            mwu.label += STATESPLIT + next(candidates)
    elif name == 'nladdunary':  # introduce unary NPs
        lastid = getmaxid(tree)
        for node in tree.postorder(lambda n: strip(n.label) in {'n', 'vnw'}
                and strip(n.parent.label) in {'SMAIN', 'PP', 'INF'}):
            # the existing node becomes the NP; a new child takes over
            # its label, children and a copy of its source fields.
            formerchildren = node[:]
            node[:] = []
            inner = ParentedTree(node.label, formerchildren)
            inner.source = node.source[:]
            node[:] = [inner]
            node.source[TAG] = 'NP'
            node.label = 'NP'
            node.source[FUNC] = inner.source[FUNC]
            # the head function is written in the same case as
            # the function tag of the NP
            func = node.source[FUNC]
            inner.source[FUNC] = 'HD' if func and func[0].isupper() else 'hd'
            lastid += 1
            node.source[WORD] = '#%d' % lastid
            node.source[LEMMA] = '--'
            node.source[MORPH] = '--'
    elif name == 'nlelimcnj':  # assign conjuncts the function of the parent
        for conjunct in tree.subtrees(lambda n: function(n) == 'cnj'):
            conjunct.source[FUNC] = function(conjunct.parent) or '--'
    else:
        return False
    return True
def collapselabels(trees, _sents=None, tbmapping=None):
    """Replace non-root phrasal labels by coarser labels.

    Trees are modified in-place.

    :param trees: a non-empty sequence of trees; the label of the first
        tree is mapped to itself.
    :param _sents: unused.
    :param tbmapping: a mapping of treebank labels of the form::

            {coarselabel1: {finelabel1, finelabel2, ...}, ...}

        Cf. ``treebanktransforms.MAPPINGS``
    :returns: a tuple ``(trees, mapping)`` with the transformed trees
        and a mapping of their original labels to the collapsed labels.
    """
    # fine treebank label => coarse label; e.g. NP => X
    revmapping = {}
    for coarselabel in tbmapping:
        for finelabel in tbmapping[coarselabel]:
            revmapping[finelabel] = coarselabel
    # label as found in the trees (possibly with binarization and other
    # annotations) => collapsed label; e.g., NP<DT,JJ,NN> => X<X,X,X>
    mapping = {'Epsilon': 'Epsilon', trees[0].label: trees[0].label}
    # POS tags are collected first and map to themselves
    for tree in trees:
        for pos in tree.subtrees(lambda n: not isinstance(n[0], Tree)):
            mapping[pos.label] = pos.label
            revmapping[pos.label] = pos.label

    def coarsen(match):
        """Map one label component; unknown components are dropped
        (state splits, function tags, &c.)"""
        return revmapping.get(match.group(), '')

    for tree in trees:
        # skip the root node
        for node in islice(tree.subtrees(), 1, None):
            if not (node and isinstance(node[0], Tree)):
                continue
            original = node.label
            collapsed = LABELRE.sub(coarsen, original)
            collapsed = collapsed.replace('-', '').rstrip('^')
            mapping[original] = collapsed
            assert (mapping[original]
                    and mapping[original][0].isalpha()), original
            node.label = mapping[original]
    return trees, mapping
def dlevel(tree, lang='nl'):
    """Return the D-level measure of syntactic complexity.

    Original version:
    Rosenberg & Abbeduto (1987), https://doi.org/10.1017/S0142716400000047
    Covington et al. (2006), http://ai1.ai.uga.edu/caspr/2006-01-Covington.pdf

    Dutch version implemented here: Appendix A of T-Scan manual,
    https://github.com/proycon/tscan/raw/master/docs/tscanhandleiding.pdf

    The levels are tested from most to least complex; the first level
    whose criterion is met is returned.

    :param tree: A tree from the Alpino parser (i.e., not binarized, with
        function and morphological tags).
    :param lang: the language; only ``'nl'`` is supported.
    :returns: integer 0-7; 7 is most complex.
    :raises NotImplementedError: if ``lang`` is not ``'nl'``."""
    if lang != 'nl':
        raise NotImplementedError
    poslist = []
    pv_counter = neven_counter = 0
    for pos in tree.subtrees(lambda n: n and isinstance(n[0], int)):
        poslist.append(pos)
        if strip(pos.label) == 'ww' and 'pv' in morphfeats(pos):
            pv_counter += 1
        elif strip(pos.label) == 'vg' and 'neven' in morphfeats(pos):
            neven_counter += 1
    # 7: sentence with multiple subordinate clauses
    # (disregarding clauses in conjunctions)
    if pv_counter - neven_counter > 2:
        return 7
    # 6: a subordinate clause modifying the subject
    for node in tree.subtrees():
        if (strip(node.label) == 'REL' and function(node) == 'mod'
                and function(node.parent) == 'su'):
            return 6
        elif (strip(node.label) in ('CP', 'WHSUB', 'WHREL', 'TI', 'OTI', 'INF')
                and function(node) == 'su'):
            return 6
        elif (strip(node.label) == 'ww' and function(node.parent) == 'su'
                and strip(node.parent.label) == 'NP'):
            return 6
    # 5: subordinate clause
    for pos in poslist:
        if (strip(pos.label) == 'vg' and 'onder' in morphfeats(pos)
                and pos.source[LEMMA] != 'dat'):
            return 5
    # 4: non-finite clause as object with overt subject
    for node in tree.subtrees():
        if function(node) == 'obcomp':
            return 4
    for node in tree.subtrees(lambda n: function(n) == 'vc'):
        if strip(node.label) in ('TI', 'OTI', 'INF'):
            vcid = node.source[WORD].lstrip('#')
            for sib in node.parent:
                if function(sib) == 'obj1' and hassecedge(sib, 'su', vcid):
                    return 4
    # 3: finite clause as objects (and equivalents)
    for node in tree.subtrees():
        if strip(node.label) == 'REL' and function(node) == 'mod':
            if function(node.parent) == 'obj1':
                return 3
        elif strip(node.label) == 'ww':
            if strip(node.parent.label) == 'NP' and function(
                    node.parent) == 'obj1':
                return 3
        elif strip(node.label) in ('CP', 'WHSUB') and function(node) == 'vc':
            return 3
        elif function(node) == 'sup':
            return 3
    # 2: coordinated structure
    for pos in poslist:
        if strip(pos.label) == 'vg' and 'neven' in morphfeats(pos):
            return 2
    # 1: non-finite clause with subject coindexed from main clause
    for node in tree.subtrees(lambda n: function(n) == 'vc'):
        if strip(node.label) in ('TI', 'OTI', 'INF'):
            vcid = node.source[WORD].lstrip('#')
            for sib in node.parent:
                if function(sib) == 'su' and hassecedge(sib, 'su', vcid):
                    # this used to return 2, which made level 1 unreachable
                    return 1
    # 0: simple sentence
    return 0
def rindex(l, v):
    """Return the index of the last occurrence of ``v`` in ``l``.

    The counterpart of ``list.index()`` searching from right to left;
    like that method, raises ``ValueError`` when ``v`` does not occur."""
    # position of the match, counted from the right end
    fromright = l[::-1].index(v)
    return len(l) - fromright - 1
def labels(tree):
    """:returns: a list with the labels of those children of this node
        that are ``Tree`` objects, in order."""
    result = []
    for child in tree:
        if isinstance(child, Tree):
            result.append(child.label)
    return result
def pop(node):
    """Detach this node from its parent node, if it has one.

    Convenience function for ParentedTrees.

    :returns: the result of popping ``node.parent_index`` from
        ``node.parent``; when that fails with an ``AttributeError``
        (e.g., the parent is ``None``), ``node`` itself."""
    try:
        parent = node.parent
        return parent.pop(node.parent_index)
    except AttributeError:
        return node
def base(node, match):
    """Test whether ``node.label`` equals ``match`` after stripping features.

    :returns: True if the label is ``match``, or ``match`` immediately
        followed by ``STATESPLIT`` or ``'-'``; False otherwise."""
    label = node.label
    if label == match:
        return True
    return label.startswith((match + STATESPLIT, match + '-'))
def strip(label):
    """Equivalent to the effect of the @ operator in tregex.

    :param label: a (possibly annotated) label, as a string.
    :returns: the part of ``label`` before the first ``'-'`` or
        ``STATESPLIT``, whichever comes first; the label itself
        if it contains neither."""
    end = len(label)
    # cut at the earliest delimiter, so that the result does not depend on
    # the order in which function tags and state splits were appended;
    # e.g., both NP-SBJ^B and NP^B-SBJ give NP.
    for delimiter in ('-', STATESPLIT):
        idx = label.find(delimiter)
        if idx != -1 and idx < end:
            end = idx
    return label[:end]
def ancestors(node):
    """Yield ancestors of node from direct parent to root node.

    Nothing is yielded for a node without a parent; in particular, no
    ``None`` is yielded after the root node has been reached."""
    while node:
        node = node.parent
        if node is None:  # went past the root
            return
        yield node
def bracketings(tree):
    """Labeled bracketings of a tree.

    :returns: a list with a tuple ``(label, leaves)`` for each non-empty
        subtree whose first child is a ``Tree``, where ``leaves`` is the
        sorted tuple of the leaves of that subtree."""
    result = []
    for node in tree.subtrees(lambda t: t and isinstance(t[0], Tree)):
        span = tuple(sorted(node.leaves()))
        result.append((node.label, span))
    return result
# morphological features
def morphfeats(node):
    """Return the morphological features of a preterminal node.

    The features are read from the text between the first opening and the
    first closing bracket (round or square) of the morphology field of
    ``node.source``. Features may be separated by dots or commas.

    :returns: a list of features; an empty tuple if the field cannot be
        indexed or does not contain such brackets."""
    try:
        tag = node.source[MORPH].replace('(', '[').replace(')', ']')
        start = tag.index('[') + 1
        end = tag.index(']')
    except (TypeError, ValueError):
        return ()
    return tag[start:end].replace('.', ',').split(',')
def unifymorphfeat(feats, percolatefeatures=None):
    """Get the sorted union of features for a sequence of feature vectors.

    The separator is a dot if any of the vectors contains one, and a comma
    otherwise. For a vector with square brackets only the part between the
    brackets is used. The values ``*`` and ``--`` are left out.

    :param feats: a sequence of strings of comma/dot separated feature vectors.
    :param percolatefeatures: if a set is given, select only these features;
        by default all features are used.
    :returns: a string with the selected features, sorted and joined by
        the separator.

    >>> print(unifymorphfeat({'Def.*.*', '*.Sg.*', '*.*.Akk'}))
    Akk.Def.Sg
    >>> print(unifymorphfeat({'LID[bep,stan,rest]', 'N[soort,ev,zijd,stan]'}))
    bep,ev,rest,soort,stan,zijd"""
    sep = ','
    for vector in feats:
        if '.' in vector:
            sep = '.'
            break
    union = set()
    for vector in feats:
        if '[' in vector:
            start = vector.index('[') + 1
            vector = vector[start:vector.index(']')]
        for feat in vector.split(sep):
            union.add(feat)
    if percolatefeatures:
        union.intersection_update(percolatefeatures)
    union -= {'*', '--'}
    return sep.join(sorted(union))
# Function tags
def function(node):
    """:returns: The first function tag for node, or the empty string.

    The empty string is returned when ``node`` has no ``source``
    attribute, when that attribute is ``None``, or when its function field
    is ``None`` (the same cases in which ``functions()`` returns an empty
    list)."""
    source = getattr(node, 'source', None)
    if source is None:
        return ''
    func = source[FUNC]
    if func is None:
        return ''
    # multiple function tags are joined by '-'; '--' yields ''
    return func.split('-')[0]
def functions(node):
    """:returns: list of function tags for node, or an empty list.

    An empty list is returned when ``node`` has no ``source``, or when
    its function field is ``'--'``, empty, or ``None``."""
    source = getattr(node, 'source', None)
    if source is None:
        return []
    func = source[FUNC]
    if func is None or func == '' or func == '--':
        return []
    return func.split('-')
# Secondary edges
def hassecedge(node, func, parentid):
    """Test whether this node has a secondary edge ``(func, parentid)``.

    Secondary edges are read as consecutive pairs from the fields of
    ``node.source`` starting at index 6; an incomplete trailing pair is
    ignored.

    :returns: True if one of the pairs equals ``(func, parentid)``;
        False otherwise, or if ``node`` has no ``source``."""
    source = getattr(node, 'source', None)
    if source is None:
        return False
    edges = source[6:]
    for n in range(0, len(edges) - 1, 2):
        if edges[n] == func and edges[n + 1] == parentid:
            return True
    return False
def getmaxid(tree):
    """Return highest export non-terminal ID in tree.

    Only nodes with a non-empty ``source`` whose word field matches
    ``EXPORTNONTERMINAL`` are considered; the ID is that field without
    leading ``#``, as an integer.

    :returns: the highest ID found, or 500 if there is no such node."""
    def hasid(node):
        """Test whether node carries a non-terminal ID."""
        return node.source and EXPORTNONTERMINAL.match(node.source[WORD])

    ids = [int(node.source[WORD].lstrip('#'))
            for node in tree.subtrees(hasid)]
    if not ids:
        return 500
    return max(ids)
# The public names of this module.
__all__ = [
    'expandpresets',
    'transform',
    'reversetransform',
    'collapselabels',
    'dlevel',
    'rrtransform',
    'rrbacktransform',
    'rindex',
    'labels',
    'pop',
    'strip',
    'ancestors',
    'bracketings',
    'morphfeats',
    'unifymorphfeat',
    'function',
    'functions',
    'hassecedge',
]