Source code for discodop.treebanktransforms

# -*- coding: UTF-8 -*-
"""Treebank transformations.

- Transforms (primarily state splits) listed by name
- Relational-realizational transform
"""
from __future__ import division, print_function, absolute_import, \
		unicode_literals
import re
from itertools import islice
from .tree import Tree, ParentedTree, escape, unescape, ptbescape
from .treebank import EXPORTNONTERMINAL
from .treetransforms import addfanoutmarkers, removefanoutmarkers
from .punctuation import punctprune, PUNCTUATION
from .util import ishead

FIELDS = tuple(range(6))
WORD, LEMMA, TAG, MORPH, FUNC, PARENT = FIELDS
STATESPLIT = '^'
LABELRE = re.compile("[^^|<>,;:_-]+")
CASERE = re.compile(r'\b(Nom|Acc|Gen|Dat)\b')
DERE = re.compile("^([Dd]es?|du|d')$")
PPORNP = re.compile('^(NP|PP)+PP$')
YEARRE = re.compile('^(?:19|20)[0-9]{2}$')
PRESETS = {
		# basic state splits, German, English, Dutch:
		'negra': ('S-RC', 'VP-GF', 'NP', 'PUNCT'),
		'wsj': ('PTBbrackets', 'S-WH', 'VP-HD', 'S-INF'),
		'alpino': ('PUNCT', ),
		# extensive state splits following particular papers:
		# French
		'green2013ftb': ('markinf,markpart,de2,markp1,mwadvs,mwadvsel1,'
			'mwadvsel2,mwnsel1,mwnsel2,PUNCT,TAGPA').split(','),
		# English
		# These are the "-goodPCFG" options of the Stanford Parser
		'km2003wsj': ('PTBbrackets,splitIN4,splitPercent,splitPoss,splitCC,'
			'unaryDT,unaryRB,splitAux2,splitVP3,splitSGapped,splitTMP,'
			'splitBaseNP,dominatesV,splitNPADV,markDitransV,MARK-YEAR'
			).split(','),
		# a simpler variant mentioned in Bansal & Klein 2010
		'km2003simple': ('PTBbrackets,splitIN4,splitPercent,splitPoss,splitCC,'
			'unaryDT,unaryRB,splitAux2,splitSGapped,splitBaseNP,dominatesV,'
			'splitNPADV,markDitransV,MARK-YEAR').split(','),
		# German
		'fraser2013tiger': ('elimNKCJ,addUnary,APPEND-FUNC,addCase,lexPrep,'
			'PUNCT,adjAttach,relPath,whFeat,nounSeq,properChunks,markAP,'
			'subConjType,VPfeat,noHead,noSubj,MARK-YEAR').split(','),
		# Dutch
		'lassy': ('nladdunary,nlelimcnj,nlselectmorph,PUNCT,'
			'MARK-YEAR,nlpercolatemorph,nlmwuhead').split(','),
		# this variant adds function tags to non-terminal labels
		'lassy-func': ('nladdunary,nlelimcnj,APPEND-FUNC,nlselectmorph,PUNCT,'
			'MARK-YEAR,nlpercolatemorph,nlmwuhead').split(','),
		}

# Mappings for multi-level coarse-to-fine parsing
# following Charniak et al. (2006), multi-level coarse-to-fine parsing.
# http://aclweb.org/anthology/N06-1022
MAPPINGS = {
		'ptb': {
			# level 0: P (all phrase labels)
			0: {'P': {'S', 'VP', 'UCP', 'SQ', 'SBAR', 'SBARQ', 'SINV',
					'NP', 'NAC', 'NX', 'LST', 'X', 'FRAG', 'PRT|ADVP',
					'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
					'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP'}},
			# level 1: HP (arguments), MP (modifiers)
			1: {'HP': {'S', 'VP', 'UCP', 'SQ', 'SBAR', 'SBARQ', 'SINV',
					'NP', 'NAC', 'NX', 'LST', 'X', 'FRAG'},
				'MP': {'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
					'PRT|ADVP', 'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP',
					'WHPP'}},
			# level 2: S (verbal), N (nominal), A (adjectival),
			# 	P (prepositional)
			# note: PRT is part of both A_ and P_ in the paper;
			# UCP is part of both S_ and N_
			2: {'S_': {'S', 'VP', 'SQ', 'SBAR', 'SBARQ', 'SINV'},
				'N_': {'NP', 'NAC', 'NX', 'LST', 'X', 'UCP', 'FRAG'},
				'A_': {'ADJP', 'QP', 'CONJP', 'ADVP', 'INTJ', 'PRN', 'PRT',
					'PRT|ADVP'},
				'P_': {'PP', 'RRC', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP'}},
			# level 3: no-op, return original treebank labels
		},
		'negra': {
			# level 0: P (all phrase labels)
			0: {'P': {'--', 'AA', 'AP', 'AVP', 'CAC', 'CAP', 'CAVP', 'CCP',
				'CH', 'CNP', 'CO', 'CPP', 'CS', 'CVP', 'CVZ', 'DL', 'ISU',
				'MPN', 'MTA', 'NM', 'NP', 'PN', 'PP', 'QL', 'S', 'VP', 'VZ'}},
			# level 1: HP (arguments), MP (modifiers)
			1: {'HP': {'NP', 'S', 'VP', 'VZ', 'CO', 'AA', 'CNP', 'CS', 'CVP',
					'CVZ', 'PN', 'MPN', 'NM', 'CH', 'CCP', 'DL', 'ISU', 'QL'},
				'MP': {'--', 'AP', 'PP', 'AVP', 'CAP', 'CPP', 'CAVP', 'CAC',
					'MTA'}},
			# level 2: S (verbal), N (nominal), A (adjectival),
			# 	P (prepositional)
			2: {'S_': {'S', 'VP', 'VZ', 'CO', 'AA', 'CS', 'CVP',
					'CVZ', 'CCP', 'DL', 'ISU', 'QL'},
				'N_': {'NP', 'CNP', 'PN', 'MPN', 'NM', 'CH'},
				'A_': {'--', 'AP', 'AVP', 'CAP', 'CAVP', 'MTA'}},
				'P_': {'PP', 'CPP', 'CAC'}
			# level 3: no-op, return original treebank labels
		},
		'alpino': {
			# level 0: P (all phrase labels)
			0: {'P': {'ADVP', 'AHI', 'AP', 'CONJ', 'CP', 'DETP', 'DU', 'INF',
					'MWU', 'NP', 'OTI', 'PP', 'PPART', 'PPRES', 'REL', 'SMAIN',
					'SSUB', 'SV1', 'SVAN', 'TI', 'WHQ', 'WHREL', 'WHSUB'}},
			# level 1: HP (arguments), MP (modifiers)
			1: {'HP': {'AHI', 'CONJ', 'CP', 'DETP', 'DU', 'INF', 'MWU', 'NP',
					'OTI', 'PPART', 'PPRES', 'REL', 'SMAIN', 'SSUB', 'SVAN',
					'SV1', 'TI', 'WHSUB', 'WHQ'},
				'MP': {'AP', 'ADVP', 'PP', 'REL', 'WHREL'}},
			# level 2: S (verbal), N (nominal), A (adjectival),
			# 	P (prepositional)
			2: {'S_': {'AHI', 'CP', 'DU', 'INF', 'OTI', 'PPART', 'PPRES',
					'SMAIN', 'SSUB', 'SVAN', 'SV1', 'TI', 'WHSUB', 'WHQ'},
				'N_': {'CONJ', 'DETP', 'MWU', 'NP'},
				'A_': {'AP', 'ADVP', 'REL', 'WHREL'},
				'P_': {'PP'}},
			# level 3: no-op, return original treebank labels
		},
	}


[docs]def expandpresets(transformations): """Expand aliases for presets.""" return [a for name in transformations for a in PRESETS.get(name, [name])]
[docs]def transform(tree, sent, transformations): """Perform specified sequence of transformations on a tree. State-splits are preceded by '^'. ``transformations`` is a sequence of transformation names (order matters) that will be performed on the given tree (in-place). There are presets for particular treebanks. The name of a preset can be used as an alias that expands to a sequence of transformations; see the variable ``PRESETS``.""" # unfreeze attributes so that they can be modified for a in tree.subtrees(lambda n: isinstance( getattr(n, 'source', None), tuple)): a.source = list(a.source) for name in transformations: if name == 'APPEND-FUNC': # add function to phrasal label for a in tree.subtrees(): func = functions(a) if func and not a.label.endswith('-'): # -LRB- a.label += '-' + '-'.join(func) elif name == 'FUNC-NODE': # insert node w/function above phrasal label for a in tree.postorder(): func = functions(a) if func and not a.label.endswith('-'): # -LRB- a[:] = [a.__class__(a.label, [a.pop() for _ in range(len(a))][::-1])] a.label = '-' + '-'.join(func) elif name == 'APPEND-MORPH': # Append morph. features to POS tag for a in tree.subtree(lambda n: n and isinstance(n[0], int)): morph = '--' if getattr(a, 'source', None): morph = a.source[MORPH].replace('(', '[').replace(')', ']') a.label += STATESPLIT + morph elif name == 'MORPH-NODE': # insert node w/morph. features above POS for a in tree.postorder(lambda n: n and isinstance(n[0], int)): morph = '--' if getattr(a, 'source', None): morph = a.source[MORPH].replace('(', '[').replace(')', ']') a[:] = [a.__class__(morph, [a.pop() for _ in range(len(a))][::-1])] elif name == 'LEMMA-NODE': # insert node w/lemma above terminal for a in tree.postorder(lambda n: n and isinstance(n[0], int)): lemma = '--' if getattr(a, 'source', None): lemma = escape(a.source[LEMMA]) a[:] = [a.__class__(lemma, [a.pop() for _ in range(len(a))][::-1])] elif name == 'MARK-YEAR': # mark POS label of year terminals for node in tree.subtrees(lambda n: n and isinstance(n[0], int) and YEARRE.match(sent[n[0]])): node.label += STATESPLIT + 'year' elif name == 'PUNCT': # distinguish sentence-ending punctuation. for punct in tree.subtrees(lambda n: n and isinstance(n[0], int) and sent[n[0]] in '.?!'): punct.label += STATESPLIT + sent[punct[0]] elif name == 'PUNCT-PRUNE': # remove initial/ending quotes & period punctprune(tree, sent) elif name == 'FANOUT': # add fan-out markers addfanoutmarkers(tree) elif name == 'PARENT': # add one level of parent annotation # Useful to do here to add the parent annotations before # adding any other annotations to the labels. # Skips preterminals. for node in islice(tree.subtrees( lambda n: n and isinstance(n[0], Tree)), 1, None): node.label += STATESPLIT + strip(node.parent.label) elif name == 'TAGPA': # Add parent annotation to non-punct. POS tags for node in tree.subtrees(lambda n: n and isinstance(n[0], int) and sent[n[0]] not in PUNCTUATION): node.label += STATESPLIT + strip(node.parent.label) elif name == 'NP-PP': # mark PPs under NPs for pp in tree.subtrees(lambda n: n.label == 'PP' and n.parent.label == 'NP'): pp.label += STATESPLIT + 'NP' elif (negratransforms(name, tree, sent) or lassytransforms(name, tree, sent) or ptbtransforms(name, tree, sent) or ftbtransforms(name, tree, sent)): pass else: raise ValueError('unrecognized transformation %r' % name) for a in reversed(list(tree.subtrees(lambda x: len(x) > 1))): a.children.sort(key=Tree.leaves) return tree
def negratransforms(name, tree, sent): """Negra / Tiger transforms.""" if name == 'S-RC': # relative clause => S becomes S^RC for s in tree.subtrees(lambda n: n.label == 'S' and function(n) == 'RC'): s.label += STATESPLIT + 'RC' elif name == 'NP': # case for np in tree.subtrees(lambda n: n.label == 'NP'): np.label += STATESPLIT + function(np) elif name == 'PP-NP': # un-flatten PPs by introducing NPs addtopp = ('AC', ) for pp in tree.subtrees(lambda n: n.label == 'PP'): ac = [a for a in pp if function(a) in addtopp] # anything before an initial preposition goes to the PP # (modifiers, punctuation), otherwise it goes to the NP; # mutatis mutandis for postpositions. funcs = [function(x) for x in pp] if 'AC' in funcs and 'NK' in funcs: if funcs.index('AC') < funcs.index('NK'): ac[:0] = pp[:funcs.index('AC')] if rindex(funcs, 'AC') > rindex(funcs, 'NK'): ac += pp[rindex(funcs, 'AC') + 1:] # else: # print('PP but no AC or NK', ' '.join(funcs)) nk = [a for a in pp if a not in ac] # introduce a PP unless there is already an NP in the PP # (annotation mistake?), or there is a PN and we want to avoid # a cylic unary of NP -> PN -> NP. if ac and nk and (len(nk) > 1 or nk[0].label not in s('NP', 'PN')): pp[:] = [] pp[:] = ac + [ParentedTree('NP', nk)] elif name == 'DP': # introduce determiner phrases (DPs) # determiners = {'ART', 'PDS', 'PDAT', 'PIS', 'PIAT', 'PPOSAT', # 'PRELS', 'PRELAT', 'PWS', 'PWAT', 'PWAV'} determiners = {'ART'} for np in list(tree.subtrees(lambda n: n.label == 'NP')): if np[0].label in determiners: np.label = 'DP' if len(np) > 1 and np[1].label != 'PN': np1 = np[1:] np[1:] = [] np[1:] = [ParentedTree('NP', np1)] elif name == 'VP-GF': # VP category split based on head for vp in tree.subtrees(lambda n: n.label == 'VP'): vp.label += STATESPLIT + function(vp) elif name == 'VP-FIN_NEGRA': # introduce finite VP at S level # collect objects and modifiers # introduce new S level for discourse markers newlevel = 'DM'.split() addtovp = 'HD AC DA MO NG OA OA2 OC OG PD VO SVP'.split() def finitevp(s): """Introduce finite VPs grouping verbs and their objects.""" if any(x.label.startswith('V') and x.label.endswith('FIN') for x in s if isinstance(x, Tree)): vp = [a for a in s if function(a) in addtovp] # introduce a VP unless it would lead to a unary # VP -> VP production if len(vp) != 1 or vp[0].label != 'VP': s[:] = [pop(a) for a in s if function(a) not in addtovp ] + [pop(a) for a in vp] toplevel_s = [] if 'S' in labels(tree): toplevel_s = [a for a in tree if a.label == 'S'] for s in toplevel_s: while function(s[0]) in newlevel: s[:] = [s[0], ParentedTree('S', s[1:])] s = s[1] toplevel_s = [s] elif 'CS' in labels(tree): cs = tree[labels(tree).index('CS')] toplevel_s = [a for a in cs if a.label == 'S'] for a in toplevel_s: finitevp(a) elif name == 'POS-PART': # introduce POS tag for particle verbs for a in tree.subtrees( lambda n: any(function(x) == 'SVP' for x in n)): svp = [x for x in a if function(x) == 'SVP'].pop() # apparently there can be a _verb_ particle without a verb. # headlines? annotation mistake? if any(map(ishead, a)): hd = [x for x in a if ishead(x)].pop() if hd.label != a.label: particleverb = ParentedTree(hd.label, [hd, svp]) a[:] = [particleverb if ishead(x) else x for x in a if function(x) != 'SVP'] elif name == 'SBAR': # introduce SBAR level sbarfunc = 'CP'.split() # in the annotation, complementizers belong to the first S # in S conjunctions, even when they appear to apply to the whole # conjunction. for s in list(tree.subtrees(lambda n: n.label == 'S' and function(n[0]) in sbarfunc and len(n) > 1)): s.label = 'SBAR' s[:] = [s[0], ParentedTree('S', s[1:])] elif name == 'NEST': # introduce nested structures for modifiers # (iterated adjunction instead of sister adjunction) adjunctable = {'NP'} # PP AP VP for a in list(tree.subtrees(lambda n: n.label in adjunctable and any(function(x) == 'MO' for x in n))): modifiers = [x for x in a if function(x) == 'MO'] if min(n for n, x in enumerate(a) if function(x) == 'MO') == 0: modifiers[:] = modifiers[::-1] while modifiers: modifier = modifiers.pop() a[:] = [ParentedTree(a.label, [x for x in a if x != modifier]), modifier] a = a[0] # The following transformations as described in Fraser et al (CL, 2013) # http://aclweb.org/anthology/J13-1005 elif name == 'addUnary': # introduce unary NPs maxid = getmaxid(tree) for node in tree.postorder(lambda n: strip(n.label) in {'NN', 'PPER', 'PDS', 'PIS', 'PRELS', 'CARD', 'PN'} and strip(n.parent.label) in {'S', 'VP', 'ROOT', 'DL'}): if node.label == 'PN' and len(node) == 1: # only complex PNs continue children = node[:] node[:] = [] tag = ParentedTree(node.label, children) tag.source = node.source[:] node[:] = [tag] node.source[TAG] = node.label = 'NP' node.source[FUNC] = tag.source[FUNC] maxid += 1 node.source[WORD] = '#%d' % maxid node.source[LEMMA] = node.source[MORPH] = '--' tag.source[FUNC] = 'HD' elif name == 'addCase': # add case features to POS tags for node in tree.subtrees(lambda n: n and isinstance(n[0], int)): case = CASERE.match(node.source[MORPH]) if case: node.label += '/' + case.group(1) elif name == 'elimNKCJ': # eliminate NK and CJ functions for node in tree.subtrees(lambda n: function(n) in {'NK', 'CJ'}): if function(node) == 'NK': node.source[FUNC] = 'HD' else: # elif function(node) == 'CJ': node.source[FUNC] = function(node.parent) or '--' elif name == 'lexPrep': # lexicalize frequent prepositions/conjunctions for node in tree.subtrees(lambda n: n and isinstance(n[0], int)): word = sent[node[0]].lower() if base(node, 'APPR') and word in { 'in', 'von', 'auf', 'durch', 'um', 'unter', 'unters', 'unterm'}: node.label += STATESPLIT + word elif base(node, 'KON') and function(node) == 'CD' and (word in { 'sowohl', 'als', 'weder', 'entweder', 'noch'} or (word == 'oder' # 'oder' if preceded by entweder and any(a.lower() == 'entweder' for a in sent[:node[0]]))): node.label += STATESPLIT + word elif name == 'adjAttach': # annotate attachments of adjuncts for node in tree.subtrees( lambda n: strip(n.label) in {'PP', 'AVP', 'ADV', 'ADJD'}): if strip(node.parent.label) in {'S', 'VP'}: annot = 'V' elif strip(node.parent.label) in {'NP', 'PP'}: annot = 'N' else: annot = '0' node.label += STATESPLIT + annot if strip(node.label) == 'AVP': # propagate to head child for child in node: if ishead(child): child.label += STATESPLIT + annot break elif name == 'relPath': # mark path from relative clause to rel. pronoun for node in tree.subtrees(lambda n: base(n, 'S') and function(n) == 'RC'): for child in node.subtrees(lambda n: strip(n.label) in {'PRELS', 'PRELAT', 'PWAV', 'PWS'}): child = child.parent while child is not node: child.label += STATESPLIT + 'rel' child = child.parent node.label += STATESPLIT + 'rel' break else: # no rel. pronoun found node.label += STATESPLIT + 'norel' elif name == 'whFeat': # mark NP/PP that immediately dominates WH-pronoun for node in tree.subtrees(lambda n: strip(n.label) in {'NP', 'PP'} and any(strip(a.label) in {'PWAT', 'PWS', 'PWAV'} for a in n)): node.label += STATESPLIT + 'wh' elif name == 'nounSeq': # consecutive nouns in NP for node in tree.subtrees(lambda n: base(n, 'NP')): for n, a in enumerate(node[:-1]): if base(a, 'NN') and base(node[n + 1], 'NN'): a.label += STATESPLIT + 'seq' break elif name == 'properChunks': # mark POS tags in proper noun chunks for node in tree.subtrees(lambda n: base(n, 'NP') and function(n) == 'PNC'): for tag in node: tag.label += STATESPLIT + 'name' elif name == 'markAP': # mark predicative APs, APs with nominal head for node in tree.subtrees(lambda n: base(n, 'AP')): if any(base(a, 'ADJD') for a in node.subtrees()): node.label += STATESPLIT + 'pred' if any(ishead(child) and strip(child.label) in {'NN', 'NP'} for child in node): node.label += STATESPLIT + 'nom' elif name == 'subConjType': # mark type of subordinating conj. for node in tree.subtrees(lambda n: base(n, 'S') and function(n) in {'SB', 'OC', 'MO', 'RE'}): for child in node: if base(child, 'KOUS'): child.label += STATESPLIT + function(node) break elif name == 'VPfeat': # mark object VPs with head label for node in tree.subtrees(lambda n: base(n, 'VP') and function(n) == 'OC'): for child in node: if ishead(child): node.label += STATESPLIT + strip(child.label) break elif name == 'noHead': # constituents without head child for node in tree.subtrees(lambda n: n is not tree and n and isinstance(n[0], Tree)): # The heuristically found heads do not count. if not any(function(child) in {'HD', 'PNC', 'AC', 'AVC', 'NMC', 'PH', 'PD', 'ADC', 'UC', 'DH'} for child in node): node.label += STATESPLIT + 'nohead' elif name == 'noSubj': # conjunct clauses without subject for node in tree.subtrees(lambda n: n and isinstance(n[0], Tree) and base(n, 'S') and function(n) == 'CJ'): if not any(function(child) in {'SB', 'EP'} or strip(child.label) in {'VVIMP', 'VAIMP'} for child in node): node.label += STATESPLIT + 'nosubj' else: return False return True def ptbtransforms(name, tree, sent): """Transforms for WSJ section of Penn treebank.""" if name == 'S-WH': for sbar in tree.subtrees(lambda n: n.label == 'SBAR'): for s in sbar: if (s.label == 'S' and any(a.label.startswith('WH') for a in s)): s.label += STATESPLIT + 'WH' elif name == 'VP-HD': # VP category split based on head for vp in tree.subtrees(lambda n: n.label == 'VP'): hd = [x for x in vp if ishead(x)].pop() if hd.label == 'VB': vp.label += STATESPLIT + 'HINF' elif hd.label == 'TO': vp.label += STATESPLIT + 'HTO' elif hd.label in ('VBN', 'VBG'): vp.label += STATESPLIT + 'HPART' elif name == 'S-INF': for s in tree.subtrees(lambda n: n.label == 'S'): hd = [x for x in s if ishead(x)].pop() if hd.label in ('VP' + STATESPLIT + 'HINF', 'VP' + STATESPLIT + 'HTO'): s.label += STATESPLIT + 'INF' elif name == 'VP-FIN_WSJ': # add disc. finite VP when verb is under S # this counters the flattening when a VP is not possible because of # non-standard word order; e.g. is John happy for s in tree.postorder(lambda n: n.label == 'S'): if not any(a.label.startswith('VP') for a in s): vp = ParentedTree('VP', []) for child in list(s): # FIXME: check which functions should not go in the VP # (pre)modifiers unclear. if 'SBJ' not in functions(child): vp.append(s.pop(child)) s.append(vp) elif name == 'MARK-UNARY': # add -U to unary nodes to avoid cycles for unary in tree.subtrees(lambda n: len(n) == 1 and isinstance(n[0], Tree)): unary.label += STATESPLIT + 'U' # The following transformations are translations of # the Stanford Parser state splits described in # Accurate Unlexicalized Parsing (ACL 2003). # http://aclweb.org/anthology/P03-1054 elif name == 'splitIN': # Stanford Parser splitIN=3 for node in tree.subtrees(lambda n: base(n, 'IN')): if base(node.parent.parent, 'N') and ( base(node.parent, 'P') or base(node.parent, 'A')): node.label += STATESPLIT + 'N' elif base(node.parent, 'Q') and ( base(node.parent.parent, 'N') or base(node.parent.parent, 'ADJP')): node.label += STATESPLIT + 'Q' elif base(node.parent.parent, 'S'): if base(node.parent, 'SBAR'): node.label += STATESPLIT + 'SCC' else: node.label += STATESPLIT + 'SC' elif base(node.parent, 'SBAR') or base( node.parent, 'WHNP'): node.label += STATESPLIT + 'T' elif name == 'splitIN4': # Stanford Parser splitIN=4 for node in tree.subtrees(lambda n: base(n, 'IN')): if base(node.parent.parent, 'N') and ( base(node.parent, 'P') or base(node.parent, 'A')): node.label += STATESPLIT + 'N' elif base(node.parent, 'Q') and ( base(node.parent.parent, 'N') or base(node.parent.parent, 'ADJP')): node.label += STATESPLIT + 'Q' elif node.parent.parent.label[0] == 'S' and not base( node.parent.parent, 'SBAR'): if base(node.parent, 'SBAR'): node.label += STATESPLIT + 'SCC' elif not base(node.parent, 'NP') and not base( node.parent, 'ADJP'): node.label += STATESPLIT + 'SC' elif base(node.parent, 'SBAR') or base( node.parent, 'WHNP') or base(node.parent, 'WHADVP'): node.label += STATESPLIT + 'T' elif name == 'splitPercent': # Stanford Parser splitPercent=1 for node in tree.subtrees(lambda n: n and isinstance(n[0], int) and sent[n[0]] == '%'): node.label += STATESPLIT + r'%' elif name == 'splitPoss': # Stanford Parser splitPoss=1 for node in tree.subtrees(lambda n: base(n, 'NP') and n[-1].label.startswith('POS')): node.label += STATESPLIT + 'P' elif name == 'splitCC': # Stanford Parser splitCC=2 for node in tree.subtrees(lambda n: base(n, 'CC')): if sent[node[0]].lower() == 'but': node.label += STATESPLIT + 'B' elif sent[node[0]] == '&': node.label += STATESPLIT + 'A' elif name == 'unaryDT': # Stanford Parser unaryDT=true for node in tree.subtrees(lambda n: base(n, 'DT') and len(n.parent) == 1): node.label += STATESPLIT + 'U' elif name == 'unaryRB': # Stanford Parser unaryRB=true for node in tree.subtrees(lambda n: base(n, 'RB') and len(n.parent) == 1): node.label += STATESPLIT + 'U' elif name == 'splitAux': # Stanford Parser splitAux=1 for node in tree.subtrees(lambda n: strip(n.label) in {'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'VB'}): if sent[node[0]].lower() in { 'is', 'am', 'are', 'was', 'were', "'m", "'re", "'s", 'being', 'be', 'been'}: node.label += STATESPLIT + 'BE' elif sent[node[0]].lower() in { 'have', "'ve", 'having', 'has', 'had', "'d"}: node.label += STATESPLIT + 'HV' elif name == 'splitAux2': # Stanford Parser splitAux=2 for node in tree.subtrees(lambda n: strip(n.label) in {'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'VB'}): if sent[node[0]].lower() in {"'s", "s"}: # 's can be a contraction of both "is" and "have" foundAux = False for sibling in node.parent: if foundAux: if base(sibling, 'VP') and any(strip(a.label) in {'VBD', 'VBN'} for a in sibling): node.label += STATESPLIT + 'HV' break elif sibling.label.startswith('VBZ'): foundAux = True else: node.label += STATESPLIT + 'BE' if sent[node[0]].lower() in {'am', 'is', 'are', 'was', 'were', "'m", "'re", 'be', 'being', 'been', 'ai'}: node.label += STATESPLIT + 'BE' elif sent[node[0]].lower() in { 'have', "'ve", 'having', 'has', 'had', "'d"}: node.label += STATESPLIT + 'HV' elif name == 'splitVP': # Stanford Parser splitVP=2 for node in tree.subtrees(lambda n: base(n, 'VP')): for child in node: if ishead(child): if strip(child.label) in {'VBZ', 'VBP', 'VBD', 'MD'}: node.label += STATESPLIT + 'VBF' else: node.label += STATESPLIT + strip(child.label) break elif name == 'splitVP3': # Stanford Parser splitVP=3 for node in tree.subtrees(lambda n: base(n, 'VP')): for child in node: if ishead(child): if strip(child.label) in {'VBZ', 'VBP', 'VBD', 'MD'}: node.label += STATESPLIT + 'VBF' elif strip(child.label) in {'TO', 'VBG', 'VBN', 'VB'}: node.label += STATESPLIT + strip(child.label) break elif name == 'splitSGapped': # Stanford Parser splitSGapped=3 seenPredCat = seenCC = seenS = False seenNP = 0 for node in tree.subtrees(lambda n: base(n, 'S')): for child in node: cat2 = child.label if cat2.startswith('NP'): seenNP += 1 elif strip(cat2) in {'VP', 'ADJP', 'PP', 'UCP'}: seenPredCat = True elif cat2.startswith('CC'): seenCC = True elif cat2.startswith('S'): seenS = True if (not (seenCC and seenS)) and ( seenNP == 0 or (seenNP == 1 and not seenPredCat)): node.label += STATESPLIT + 'G' elif name == 'splitTMP': # Stanford Parser splitTMP=TEMPORAL_ACL03PCFG for node in tree.postorder(lambda n: 'TMP' in functions(n)): child = node hd = None while node and isinstance(node[0], Tree): try: i, hd = next(a for a in enumerate(child) if ishead(a[1])) except StopIteration: break if strip(hd) == 'POS' and i > 0: hd = child[i - 1] child = hd if 'TMP' in functions(node): node.label += STATESPLIT + 'TMP' if hd and hd.label.startswith('N'): hd.label += STATESPLIT + 'TMP' elif name == 'splitBaseNP': # Stanford Parser splitBaseNP=1 # Mark NPs that only dominate preterminals for node in tree.subtrees(lambda n: base(n, 'NP')): if all(a and isinstance(a[0], int) for a in node): node.label += STATESPLIT + 'B' elif name == 'dominatesV': # Stanford Parser dominatesV=1 for node in tree.subtrees(lambda n: base(n, 'VP')): if any(tag.startswith('V') or tag.startswith('MD') for _, tag in node.pos()): node.label += STATESPLIT + 'v' elif name == 'splitNPADV': # Stanford Parser splitNPADV=1 for node in tree.subtrees(lambda n: base(n, 'NP') and 'ADV' in functions(n)): node.label += STATESPLIT + 'ADV' try: hd = next(a for a in node if ishead(a)) except StopIteration: continue if base(hd, 'POS') and hd.parent_index > 0: hd = node[hd.parent_index - 1] while base(hd, 'NP'): hd.label += STATESPLIT + 'ADV' try: hd = next(a for a in hd if ishead(a)) except StopIteration: break elif name == 'markDitransV': # Stanford Parser markDitransV=2 for node in tree.subtrees(lambda n: n.label.startswith('VB')): npargs = sum(1 for a in node.parent if base(a, 'NP') and 'TMP' not in functions(a)) if npargs >= 2: node.label += STATESPLIT + '2Arg' elif name == 'PTBbrackets': # ensure that brackets are in PTB format sent[:] = [ptbescape(token) for token in sent] else: return False return True def ftbtransforms(name, tree, sent): """Port of manual FTB enrichments specified in Stanford parser. cf. ``FrenchTreebankParserParams.java``""" if name == 'markinf': for t in tree.subtrees(lambda n: strip(n.label) == "V" and isinstance(n.parent, Tree) and isinstance(n.parent.parent, Tree) and strip(n.parent.label) == "VN" and strip(n.parent.parent.label) == "VPinf"): t.label += STATESPLIT + "infinitive" elif name == 'markpart': for t in tree.subtrees(lambda n: strip(n.label) == "V" and isinstance(n.parent, Tree) and isinstance(n.parent.parent, Tree) and strip(n.parent.label) == "VN" and strip(n.parent.parent.label) == "VPpart"): t.label += STATESPLIT + "participle" elif name == 'markvn': for t in tree.subtrees(lambda n: strip(n.label) == "VN"): for sub in islice(t.subtrees(), 1, None): sub.label += STATESPLIT + "withVN" elif name == 'coord1': for t in tree.subtrees(lambda n: strip(n.label) == 'COORD' and len(n) >= 2): t.label += STATESPLIT + strip(t[1].label) elif name == 'de2': for t in tree.subtrees(lambda n: strip(n.label) == 'P' and DERE.match(sent[n[0]])): t.label += STATESPLIT + "de2" elif name == 'de3': # @NP|PP|COORD >+(@NP|PP) (@PP <, (@P < /^([Dd]es?|du|d')$/)) for t in tree.subtrees(lambda n: strip(n.label) in ("PP", "COORD")): a = list(ancestors(t)) for n in range(2, len(a)): if PPORNP.match("".join(strip(x.label) for x in a[:n])): if (strip(a[n - 1][0].label) == "P" and DERE.match(sent[a[n - 1][0][0]])): t.label += STATESPLIT + "de3" break elif name == 'markp1': for t in tree.subtrees(lambda n: strip(n.label) == "P" and strip(n.parent.label) == "PP" and strip(n.parent.parent.label) == "NP"): t.label += STATESPLIT + "n" elif name == 'mwadvs': for t in tree.subtrees(lambda n: strip(n.label) == "MWADV" and "S" in n.parent.label): t.label += STATESPLIT + "mwadv-s" elif name == 'mwadvsel1': for t in tree.subtrees(lambda n: strip(n.label) == "MWADV" and len(n) == 2 and strip(n[0].label) == "P" and strip(n[1].label) == "N"): t.label += STATESPLIT + "mwadv1" elif name == 'mwadvsel2': for t in tree.subtrees(lambda n: strip(n.label) == "MWADV" and len(n) == 3 and strip(n[0].label) == "P" and strip(n[1].label) == "D" and strip(n[2].label) == "N"): t.label += STATESPLIT + "mwadv2" elif name == 'mwnsel1': for t in tree.subtrees(lambda n: strip(n.label) == "MWN" and len(n) == 2 and strip(n[0].label) == "N" and strip(n[1].label) == "A"): t.label += STATESPLIT + "mwn1" elif name == 'mwnsel2': for t in tree.subtrees(lambda n: strip(n.label) == "MWN" and len(n) == 3 and strip(n[0].label) == "N" and strip(n[1].label) == "P" and strip(n[2].label) == "N"): t.label += STATESPLIT + "mwn2" elif name == 'mwnsel3': # noun-noun compound joined with dash. for t in tree.subtrees(lambda n: strip(n.label) == "MWN" and len(n) == 3 and strip(n[0].label) == "N" and sent[n[1][0]] == "-" and strip(n[2].label) == "N"): t.label += STATESPLIT + "mwn3" else: return False return True def lassytransforms(name, tree, _sent): """Transformations for the Dutch Lassy & Alpino treebanks.""" if name == 'nlselectmorph': # add select morph. feats to coarse POS tags SELECTMORPH = {'eigen', 'det', 'pron', 'init', 'fin', 'neven', 'onder', 'prenom', 'nom', 'vrij', 'pv', 'inf', 'vd', 'od'} for pos in tree.subtrees(lambda n: n and isinstance(n[0], int)): tag = pos.source[MORPH].split('(')[0] selected = [feat for feat in morphfeats(pos) if feat in SELECTMORPH] pos.label += '/%s[%s]' % (tag, ','.join(selected)) elif name == 'nlpercolatemorph': # percolate select morph tags upwards PERCOLATE = {'pv': 2, 'inf': 2} for feat in sorted(PERCOLATE): lvl = PERCOLATE[feat] for pos in tree.subtrees(lambda n, f=feat: n and isinstance(n[0], int) and f in morphfeats(n)): cnt = 0 node = pos.parent while (cnt < lvl and node is not None and node.parent is not None): if not node.label.endswith(STATESPLIT + feat): node.label += STATESPLIT + feat node = node.parent cnt += 1 elif name == 'nlmwuhead': # add label of head child to MWU nodes EXPANDCAT = {'MWU'} for node in tree.subtrees(lambda n: strip(n.label) in EXPANDCAT): node.label += STATESPLIT + next( iter(strip(a.label) for a in node if ishead(a) or a is node[-1])) elif name == 'nladdunary': # introduce unary NPs maxid = getmaxid(tree) for node in tree.postorder(lambda n: strip(n.label) in {'n', 'vnw'} and strip(n.parent.label) in {'SMAIN', 'PP', 'INF'}): children = node[:] node[:] = [] tag = ParentedTree(node.label, children) tag.source = node.source[:] node[:] = [tag] node.source[TAG] = node.label = 'NP' node.source[FUNC] = tag.source[FUNC] if node.source[FUNC] and node.source[FUNC][0].isupper(): tag.source[FUNC] = 'HD' else: tag.source[FUNC] = 'hd' maxid += 1 node.source[WORD] = '#%d' % maxid node.source[LEMMA] = node.source[MORPH] = '--' elif name == 'nlelimcnj': # assign conjuncts the function of the parent for node in tree.subtrees(lambda n: function(n) == 'cnj'): node.source[FUNC] = function(node.parent) or '--' else: return False return True
[docs]def reversetransform(tree, transformations): """Undo specified transformations and remove state splits marked by ``^``. Do not apply twice (might remove VPs which shouldn't be).""" # Generic state-split removal for node in tree.subtrees(lambda n: STATESPLIT in n.label[1:]): node.label = node.label[:node.label.index(STATESPLIT, 1)] # restore linear precedence order for a in tree.subtrees(lambda n: len(n) > 1): a.children.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) # unfreeze attributes so that they can be modified for a in tree.subtrees(): if isinstance(getattr(a, 'source', None), tuple): a.source = list(a.source) for name in reversed(transformations): if name == 'FANOUT': removefanoutmarkers(tree) elif name == 'DP': # remove DPs for dp in tree.subtrees(lambda n: n.label == 'DP'): dp.label = 'NP' if len(dp) > 1 and dp[1].label == 'NP': # dp1 = dp[1][:] # dp[1][:] = [] # dp[1:] = dp1 dp[1][:], dp[1:] = [], dp[1][:] elif name == 'NEST': # flatten adjunctions nkonly = {'PDAT', 'CAP', 'PPOSS', 'PPOSAT', 'ADJA', 'FM', 'PRF', 'NM', 'NN', 'NE', 'PIAT', 'PRELS', 'PN', 'TRUNC', 'CH', 'CNP', 'PWAT', 'PDS', 'VP', 'CS', 'CARD', 'ART', 'PWS', 'PPER'} probably_nk = {'AP', 'PIS'} | nkonly for n in tree.subtrees(): if (len(n) == 2 and n.label == 'NP' and [x.label for x in n].count('NP') == 1 and not set(labels(n)) & probably_nk): n.children.sort(key=lambda n: n.label == 'NP') n[:] = n[:1] + n[1][:] elif name == 'PP-NP': # flatten PPs for pp in tree.subtrees(lambda n: n.label == 'PP'): if 'NP' in labels(pp) and 'NN' not in labels(pp): # ensure NP is in last position pp.children.sort(key=lambda n: n.label == 'NP') pp[-1][:], pp[-1:] = [], pp[-1][:] elif name == 'SBAR': # merge extra S level for sbar in list(tree.subtrees(lambda n: n.label == 'SBAR' or (n.label == 'S' and len(n) == 2 and labels(n) == ['PTKANT', 'S']))): sbar.label = 'S' if sbar[0].label == 'S': sbar[:] = sbar[1:] + sbar[0][:] else: sbar[:] = sbar[:1] + sbar[1][:] elif name == 'VP-FIN_NEGRA': def mergevp(s): """Merge finite VP with S level.""" for vp in (n for n, a in enumerate(s) if a.label == 'VP'): if any(a.label.endswith('FIN') for a in s[vp]): s[vp][:], s[vp:vp + 1] = [], s[vp][:] # if any(a.label == 'S' for a in tree): # map(mergevp, [a for a in tree if a.label == 'S']) # elif any(a.label == 'CS' for a in tree): # map(mergevp, [s for cs in tree for s in cs if cs.label == 'CS' # and s.label == 'S']) for s in tree.subtrees(lambda n: n.label == 'S'): mergevp(s) elif name == 'POS-PART': # remove constituents for particle verbs # get the grandfather of each verb particle def hasparticle(n): """Test whether node has a PTKVZ node.""" return any('PTKVZ' in (x.label for x in m if isinstance(x, Tree)) for m in n if isinstance(m, Tree)) for a in list(tree.subtrees(hasparticle)): for n, b in enumerate(a): if (len(b) == 2 and b.label.startswith('V') and 'PTKVZ' in (c.label for c in b if isinstance(c, Tree)) and any(c.label == b.label for c in b)): a[n:n + 1] = b[:] elif name == 'addUnary': # remove phrasal projections for single tokens # e.g. S => NP => NN becomes S => NN for node in tree.subtrees(lambda n: strip(n.label) in {'S', 'VP', 'ROOT', 'DL'}): for child in node: if (len(child) == 1 and base(child, 'NP') and strip(child[0].label) in {'NN', 'PPER', 'PDS', 'PIS', 'PRELS', 'CARD', 'PN'}): child.label = child[0].label origfunc = function(child) child.source = getattr(child[0], 'source', None) if child.source: child.source[FUNC] = origfunc or '--' children = child[0][:] child[0][:] = [] child[:] = children elif name == 'elimNKCJ': # restore NK and CJ functions for node in tree.subtrees(lambda n: strip(n.label) in {'NP', 'PP'}): for child in node: if not getattr(child, 'source', None): child.source = ['--'] * 6 if function(child) == 'HD': child.source[FUNC] = 'NK' for node in tree.subtrees(lambda n: strip(n.label) in {'CS', 'CNP', 'CVP', 'CAP', 'CAVP', 'CAC'}): for child in node: if not getattr(child, 'source', None): child.source = ['--'] * 6 if function(child) != 'CD': child.source[FUNC] = 'CJ' elif name == 'nladdunary': # remove unary node for node in tree.subtrees(lambda n: strip(n.label) in {'SMAIN', 'PP', 'INF'}): for child in node: if (len(child) == 1 and isinstance(child[0], Tree) and base(child, 'NP') and strip(child[0].label) in {'n', 'vnw'}): child.label = child[0].label origfunc = function(child) child.source = getattr(child[0], 'source', None) if child.source: child.source[FUNC] = origfunc or '--' children = child[0][:] child[0][:] = [] child[:] = children elif name == 'nlelimcnj': # restore cnj function for node in tree.subtrees(lambda n: base(n, 'CONJ')): for child in node: if not getattr(child, 'source', None): child.source = ['--'] * 6 if function(child) != 'crd' and not base(child, 'let'): child.source[FUNC] = 'cnj' elif name == 'APPEND-FUNC': # functions appended to phrasal labels for a in tree.subtrees(lambda n: '-' in n.label and not n.label.startswith('-') and not n.label.endswith('-')): # -LRB- label, func = a.label.split('-', 1) if not getattr(a, 'source', None): a.source = ['--'] * 6 a.source[TAG] = a.label = label a.source[FUNC] = func # morphological features appended to phrasal labels elif name in {'APPEND-MORPH', 'addCase', 'nlselectmorph'}: for a in tree.subtrees(lambda n: n and isinstance(n[0], int)): if '/' in a.label: label, morph = a.label.split('/', 1) if not getattr(a, 'source', None): a.source = ['--'] * 6 a.source[TAG] = a.label = label a.source[MORPH] = morph.replace('[', '(').replace(']', ')') elif name == 'FUNC-NODE': # nodes with function above phrasal labels for a in list(tree.postorder(lambda n: n.label.startswith('-') and not n.label.endswith('-') # -LRB- and n and isinstance(n[0], Tree))): a.source = ['--'] * 6 a.source[FUNC] = a.label[1:] a.source[TAG] = a.label = a[0].label a[:] = [a[0].pop() for _ in range(len(a[0]))][::-1] elif name == 'MORPH-NODE': # nodes with morph. above preterminals for a in list(tree.postorder(lambda n: n and isinstance(n[0], Tree) and n[0] and isinstance(n[0][0], int))): a.source = ['--'] * 6 a.source[MORPH] = a.label a.source[TAG] = a.label = a[0].label a[:] = [a[0].pop() for _ in range(len(a[0]))][::-1] elif name == 'LEMMA-NODE': # nodes with lemmas above words for a in list(tree.postorder(lambda n: n and isinstance(n[0], Tree) and n[0] and isinstance(n[0][0], int))): a.source = ['--'] * 6 a.source[LEMMA] = unescape(a[0].label) a.source[TAG] = a.label a[:] = [a[0].pop() for _ in range(len(a[0]))][::-1] # restore linear precedence order for a in tree.subtrees(lambda n: len(n) > 1): a.children.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) return tree
[docs]def collapselabels(trees, _sents=None, tbmapping=None): """Collapse non-root phrasal labels with specified mapping. Trees are modified in-place. :param tbmapping: a mapping of treebank labels of the form:: {coarselabel1: {finelabel1, finelabel2, ...}, ...} Cf. ``treebanktransforms.MAPPINGS`` :returns: a tuple ``(trees, mapping)`` with the transformed trees and a mapping of their original labels to the collapsed labels. """ def collapse(tree): """Collapse labels of a single tree.""" for node in islice(tree.subtrees(), 1, None): if node and isinstance(node[0], Tree): # anything not part of the mapping is stripped # (state splits, function tags, &c.) mapping[node.label] = LABELRE.sub( lambda x: revmapping.get(x.group(), ''), # lambda x: revmapping[x.group()], node.label).replace('-', '').rstrip('^') assert (mapping[node.label] and mapping[node.label][0].isalpha()), node.label node.label = mapping[node.label] # maps original treebank labels to coarser labels; e.g. NP => X revmapping = {finelabel: coarselabel for coarselabel in tbmapping for finelabel in tbmapping[coarselabel]} # maps labels after binarization and other transformations, # e.g., NP<DT,JJ,NN> => X<X,X,X> mapping = {'Epsilon': 'Epsilon', trees[0].label: trees[0].label} # collect POS tags, will not be changed for tree in trees: for node in tree.subtrees(lambda n: not isinstance(n[0], Tree)): mapping[node.label] = node.label revmapping[node.label] = node.label for tree in trees: collapse(tree) return trees, mapping
[docs]def rrtransform(tree, morphlevels=0, percolatefeatures=None, adjunctionlabel=None, ignorefunctions=None, ignorecategories=None, adjleft=True, adjright=True): """Relational-realizational tree transformation. Every constituent node is expanded to three levels: 1) syntactic category, e.g., S 2) unordered functional argument structure of children, e.g., S/<SBJ,HD,OBJ> 3) for each child: grammatical function + parent syntactic category, e.g., OBJ/S Example:: (NP-SBJ (NN-HD ...)) => (NP (<HD>/NP (HD/NP (NN ...)))) :param adjunctionlabel: a grammatical function label identifying adjunctions. They will not be part of argument structures, and their grammatical function will be replaced with their neighboring non-adjunctive functions. :param adjleft, adjright: whether to include the left and right sibling, respectively, when replacing the function label for ``adjunctionlabel``. :param ignorefunctions: function labels that do not go into argument structure, but keep their function in their realization to make backtransform possible. :param morphlevels: if nonzero, percolate morphological features this many levels upwards. For a given node, the union of the features of its children are collected, and the result is appended to its syntactic category. :param percolatefeatures: if a sequence is given, percolate only these morphological features; by default all features are used. :returns: a new, transformed tree.""" def realize(child, prevfunc, nextfunc): """Generate realization of a child node by recursion.""" newchild, morph, lvl = rrtransform(child, morphlevels, percolatefeatures, adjunctionlabel, ignorefunctions, ignorecategories, adjleft, adjright) result = tree.__class__('%s/%s' % (('%s:%s' % (prevfunc, nextfunc) if child.source[FUNC] == adjunctionlabel and (prevfunc or nextfunc) else child.source[FUNC]), tree.label), [newchild]) return result, morph, lvl if isinstance(tree[0], int): morph = tree.source[MORPH].replace('(', '[').replace(')', ']') preterminal = tree.__class__('%s/%s' % (tree.label, morph), tree) if morphlevels: return preterminal, morph, morphlevels return preterminal, None, 0 # for each node, collect the functions of closest non-adjunctive sibling childfuncsl = (prevfunc, ) = [''] for child in tree: if (isinstance(child, Tree) and child.source[FUNC] and child.source[FUNC] != adjunctionlabel and child.source[FUNC] not in ignorefunctions and child.label not in ignorecategories): prevfunc = child.source[FUNC] childfuncsl.append(prevfunc) childfuncsr = (nextfunc, ) = [''] for child in reversed(tree[1:]): if (isinstance(child, Tree) and child.source[FUNC] and child.source[FUNC] != adjunctionlabel and child.source[FUNC] not in ignorefunctions and child.label not in ignorecategories): nextfunc = child.source[FUNC] childfuncsr.insert(0, prevfunc) funcstr = ','.join(sorted(child.source[FUNC] for child in tree if isinstance(child, Tree) and child.source[FUNC] and child.source[FUNC] != adjunctionlabel and child.source[FUNC] not in ignorefunctions and child.label not in ignorecategories)) children, feats, levels = [], [], [] for child, prevfunc, nextfunc in zip(tree, childfuncsl, childfuncsr): newchild, morph, lvl = realize(child, prevfunc if adjleft else '', nextfunc if adjright else '') children.append(newchild) if morph and lvl: feats.append(morph) levels.append(lvl) morph, lvl = None, 0 if feats and max(levels) and tree.label != 'ROOT': morph, lvl = unifymorphfeat(feats, percolatefeatures), max(levels) - 1 configuration = tree.__class__('%s/<%s>' % (tree.label, funcstr), children) projection = tree.__class__(('%s-%s' % (tree.label, morph)) if morph else tree.label, [configuration]) return projection, morph, lvl
[docs]def rrbacktransform(tree, adjunctionlabel=None, func=None): """Reverse relational-realizational transformation. :param adjunctionlabel: used to assign a grammatical function to adjunctions that have been converted to contextual labels 'next:prev'. :param func: used internally to percolate functional labels. :returns: a new tree.""" morph = None if isinstance(tree[0], int): tag, morph = tree.label.split('/') result = tree.__class__(tag, tree) elif '/' not in tree[0].label: result = tree.__class__(tree.label, [rrbacktransform(child, adjunctionlabel) for child in tree]) else: result = tree.__class__(tree.label.split('-')[0], [rrbacktransform( child[0], adjunctionlabel, child.label.split('/')[0]) for child in tree[0]]) result.source = ['--'] * 6 result.source[TAG] = result.label if morph: result.source[MORPH] = morph.replace('[', '(').replace(']', ')') if func and adjunctionlabel and ':' in func: result.source[FUNC] = adjunctionlabel elif func: result.source[FUNC] = func return result
[docs]def dlevel(tree, lang='nl'): """Return the D-level measure of syntactic complexity. Original version: Rosenberg & Abbeduto (1987), https://doi.org/10.1017/S0142716400000047 Covington et al. (2006), http://ai1.ai.uga.edu/caspr/2006-01-Covington.pdf Dutch version implemented here: Appendix A of T-Scan manual, https://github.com/proycon/tscan/raw/master/docs/tscanhandleiding.pdf :param tree: A tree from the Alpino parser (i.e., not binarized, with function and morphological tags). :returns: integer 0-7; 7 is most complex.""" if lang != 'nl': raise NotImplementedError poslist = [] pv_counter = neven_counter = 0 for pos in tree.subtrees(lambda n: n and isinstance(n[0], int)): poslist.append(pos) if strip(pos.label) == 'ww' and 'pv' in morphfeats(pos): pv_counter += 1 elif strip(pos.label) == 'vg' and 'neven' in morphfeats(pos): neven_counter += 1 # 7: sentence with multiple subordinate clauses # (disregarding clauses in conjunctions) if pv_counter - neven_counter > 2: return 7 # 6: a subordinate clause modifying the subject for node in tree.subtrees(): if (strip(node.label) == 'REL' and function(node) == 'mod' and function(node.parent) == 'su'): return 6 elif (strip(node.label) in ('CP', 'WHSUB', 'WHREL', 'TI', 'OTI', 'INF') and function(node) == 'su'): return 6 elif (strip(node.label) == 'ww' and function(node.parent) == 'su' and strip(node.parent.label) == 'NP'): return 6 # 5: subordinate clause for pos in poslist: if (strip(pos.label) == 'vg' and 'onder' in morphfeats(pos) and pos.source[LEMMA] != 'dat'): return 5 # 4: non-finite clause as object with overt subject for node in tree.subtrees(): if function(node) == 'obcomp': return 4 for node in tree.subtrees(lambda n: function(n) == 'vc'): if strip(node.label) in ('TI', 'OTI', 'INF'): vcid = node.source[WORD].lstrip('#') for sib in node.parent: if function(sib) == 'obj1' and hassecedge(sib, 'su', vcid): return 4 # 3: finite clause as objects (and equivalents) for node in tree.subtrees(): if strip(node.label) == 'REL' and function(node) == 'mod': if function(node.parent) == 'obj1': return 3 elif strip(node.label) == 'ww': if strip(node.parent.label) == 'NP' and function( node.parent) == 'obj1': return 3 elif strip(node.label) in ('CP', 'WHSUB') and function(node) == 'vc': return 3 elif function(node) == 'sup': return 3 # 2: coordinated structure for pos in poslist: if strip(pos.label) == 'vg' and 'neven' in morphfeats(pos): return 2 # 1: non-finite clause with subject coindexed from main clause for node in tree.subtrees(lambda n: function(n) == 'vc'): if strip(node.label) in ('TI', 'OTI', 'INF'): vcid = node.source[WORD].lstrip('#') for sib in node.parent: if function(sib) == 'su' and hassecedge(sib, 'su', vcid): return 2 # 0: simple sentence return 0
[docs]def rindex(l, v): """Like list.index(), but go from right to left.""" return len(l) - 1 - l[::-1].index(v)
[docs]def labels(tree): """:returns: the labels of the children of this node.""" return [a.label for a in tree if isinstance(a, Tree)]
[docs]def pop(node): """Remove this node from its parent node, if it has one. Convenience function for ParentedTrees.""" try: return node.parent.pop(node.parent_index) except AttributeError: return node
def base(node, match): """Test whether ``node.label`` equals ``match`` after stripping features.""" return (node.label == match or node.label.startswith(match + STATESPLIT) or node.label.startswith(match + '-'))
[docs]def strip(label): """Equivalent to the effect of the @ operator in tregex.""" if '-' in label: return label[:label.index('-')] elif STATESPLIT in label: return label[:label.index(STATESPLIT)] return label
[docs]def ancestors(node): """Yield ancestors of node from direct parent to root node.""" while node: node = node.parent yield node
[docs]def bracketings(tree): """Labeled bracketings of a tree.""" return [(a.label, tuple(sorted(a.leaves()))) for a in tree.subtrees(lambda t: t and isinstance(t[0], Tree))]
# morphological features
[docs]def morphfeats(node): """Return the morphological features of a preterminal node. Features may be separated by dots or commas.""" try: morph = node.source[MORPH].replace('(', '[').replace(')', ']') morph = morph[morph.index('[') + 1:morph.index(']')] except (TypeError, ValueError): return () return morph.replace('.', ',').split(',')
[docs]def unifymorphfeat(feats, percolatefeatures=None): """Get the sorted union of features for a sequence of feature vectors. :param feats: a sequence of strings of comma/dot separated feature vectors. :param percolatefeatures: if a set is given, select only these features; by default all features are used. >>> print(unifymorphfeat({'Def.*.*', '*.Sg.*', '*.*.Akk'})) Akk.Def.Sg >>> print(unifymorphfeat({'LID[bep,stan,rest]', 'N[soort,ev,zijd,stan]'})) bep,ev,rest,soort,stan,zijd""" sep = '.' if any('.' in a for a in feats) else ',' result = set() for a in feats: if '[' in a: a = a[a.index('[') + 1:a.index(']')] result.update(a.split(sep)) if percolatefeatures: result.intersection_update(percolatefeatures) return sep.join(sorted(result - {'*', '--'}))
# Function tags
[docs]def function(node): """:returns: The first function tag for node, or the empty string.""" if getattr(node, 'source', None) is None: return '' return node.source[FUNC].split('-')[0]
[docs]def functions(node): """:returns: list of function tags for node, or an empty list.""" if getattr(node, 'source', None) is None: return [] a = node.source[FUNC] if a == '--' or a == '' or a is None: return [] return a.split('-')
# Secondary edges
[docs]def hassecedge(node, func, parentid): """Test whether this node has a secondary edge ``(func, parentid)``.""" if getattr(node, 'source', None) is None: return False return any(f == func and pid == parentid for f, pid in zip(node.source[6::2], node.source[7::2]))
def getmaxid(tree): """Return highest export non-terminal ID in tree.""" return max((int(node.source[WORD].lstrip('#')) for n, node in enumerate( tree.subtrees(lambda n: n.source and EXPORTNONTERMINAL.match(n.source[WORD])), 500)), default=500) __all__ = ['expandpresets', 'transform', 'reversetransform', 'collapselabels', 'dlevel', 'rrtransform', 'rrbacktransform', 'rindex', 'labels', 'pop', 'strip', 'ancestors', 'bracketings', 'morphfeats', 'unifymorphfeat', 'function', 'functions', 'hassecedge']