Source code for discodop.heads

"""Functions related to finding the linguistic head of a constituent."""
import io
import re
from collections import defaultdict, Counter
from .tree import Tree, HEAD, COMPLEMENT, MODIFIER
from .punctuation import ispunct

# Integer indices 0..7, unpacked below into one named constant per position.
# Within this module only FUNC is used, to index ``child.source`` in
# headfinder().
# NOTE(review): the names suggest the columns of a treebank export record
# (word, lemma, tag, ...) -- confirm against the code that fills ``source``.
FIELDS = tuple(range(8))
WORD, LEMMA, TAG, MORPH, FUNC, PARENT, SECEDGETAG, SECEDGEPARENT = FIELDS
# Pattern for one (already upper-cased) line of a head rule file:
#   group 1: the constituent label (a run of non-whitespace characters);
#   group 2: the direction keyword;
#   group 3: optional remainder of the line (the candidate head labels, or
#            for LIKE the label whose rules are copied); None when absent.
HEADRULERE = re.compile(r'^(\S+)\s+(LEFT-TO-RIGHT|RIGHT-TO-LEFT'
		r'|LEFT|RIGHT|LEFTDIS|RIGHTDIS|LIKE)(?:\s+(.*))?$')


def applyheadrules(tree, headrules, modifierrules=None):
    """Mark the head child of every node in ``tree`` that dominates nodes.

    Every subtree that is non-empty and whose first child is a ``Tree`` is
    visited; the child selected by ``headfinder`` gets its ``type`` set to
    ``HEAD``. When ``modifierrules`` is given, ``markmodifiers`` is applied
    to the same node afterwards.

    :param tree: the tree to annotate; its nodes are modified in place.
    :param headrules: head rules, passed on to ``headfinder``.
    :param modifierrules: optional modifier rules, passed on to
        ``markmodifiers``; with ``None`` no modifiers are marked.
    """
    def hastreechild(node):
        """Select non-empty nodes whose first child is itself a Tree."""
        return node and isinstance(node[0], Tree)

    wantmodifiers = modifierrules is not None
    for node in tree.subtrees(hastreechild):
        chosen = headfinder(node, headrules)
        if chosen is not None:
            chosen.type = HEAD
        if wantmodifiers:
            markmodifiers(node, modifierrules)
def getheadpos(node):
    """Follow HEAD-marked children downwards and return the head word node.

    Starting at ``node``, repeatedly descend into the first child whose
    ``type`` is ``HEAD``. The first node reached whose first child is not a
    ``Tree`` is returned.

    :returns: that node, or ``None`` when an empty node is reached or a node
        on the way down has no child marked as head.
    """
    current = node
    while current:
        if not isinstance(current[0], Tree):
            return current
        for candidate in current:
            if candidate.type == HEAD:
                current = candidate
                break
        else:
            # no child is marked as head: the chain ends here.
            return None
    return None
def readheadrules(filename):
    """Read a file containing heuristic rules for head assignment.

    Example line: ``s right-to-left vmfin vafin vaimp``, which means
    traverse siblings of an S constituent from right to left; the first
    child with a label of vmfin, vafin, or vaimp will be marked as head.

    Each line is stripped and upper-cased before parsing. Empty lines,
    lines starting with ``%`` and lines with fewer than three
    whitespace-separated fields are skipped. A ``LIKE`` rule copies the
    rules collected so far for the label named after the keyword.

    :param filename: path of a UTF-8 encoded rule file.
    :returns: dict mapping each label to a list of
        ``(direction, [head labels])`` tuples, in file order.
    :raises AttributeError: when a line does not match ``HEADRULERE``; the
        offending line is printed first.
    :raises KeyError: when a ``LIKE`` rule names a label that has no rules
        yet.
    """
    rules = {}
    with io.open(filename, encoding='utf8') as inp:
        for rawline in inp:
            line = rawline.strip().upper()
            if not line or line.startswith("%") or len(line.split()) <= 2:
                continue
            try:
                label, direction, heads = HEADRULERE.match(line).groups()
            except AttributeError:
                # match() returned None; show the line, then propagate.
                print('no match:', line)
                raise
            if heads is None:
                heads = ''
            forlabel = rules.setdefault(label, [])
            if direction == 'LIKE':
                forlabel.extend(rules[heads])
            else:
                forlabel.append((direction, heads.split()))
    return rules
def headfinder(tree, headrules, headlabels=frozenset({'HD'})):
    """Use head finding rules to select one child of tree node as head.

    The first applicable step decides:

    1. a child whose ``type`` is already ``HEAD``;
    2. a child with a truthy ``source`` whose ``source[FUNC]``, upper-cased
       and split on ``-``, shares an element with ``headlabels``;
    3. the rules in ``headrules[tree.label]``, tried in order until one
       yields a child;
    4. if no rule yields a child: the first ``Tree`` child, in the scan
       order of the last rule examined (or tree order when there are no
       rules), for which ``ispunct(None, child)`` is false; failing that,
       the first child in that order.

    When a rule yields a head that is preceded by a ``CC`` or ``CONJP``
    sibling (and is at index 2 or later), the nearest earlier sibling for
    which ``ispunct`` is false is returned instead.

    :param tree: the node whose head child is to be selected.
    :param headrules: dict mapping a label to a list of
        ``(direction, heads)`` tuples.
    :param headlabels: function labels that identify a head in step 2.
    :returns: the selected child of ``tree``.
    :raises ValueError: if a rule's direction does not start with ``LEFT``
        or ``RIGHT``.
    """
    def find(heads, children):
        """Match children with possible heads.

        Head-major search: earlier entries of ``heads`` take priority over
        the position of the child. Returns None when nothing matches."""
        for head in heads:
            for child in children:
                # compare the label up to the first '[', upper-cased.
                if (isinstance(child, Tree)
                        and child.label.split('[')[0].upper() == head):
                    return child

    def invfind(heads, children):
        """Inverted version of find().

        Child-major search: the first child matching any entry of
        ``heads`` wins. Returns None when nothing matches."""
        for child in children:
            for head in heads:
                if (isinstance(child, Tree)
                        and child.label.split('[')[0].upper() == head):
                    return child

    # check if we already have head information:
    for child in tree:
        if child.type == HEAD:
            return child
    for child in tree:
        if (child.source and not headlabels.isdisjoint(
                child.source[FUNC].upper().split('-'))):
            return child
    # apply heuristic rules:
    # NOTE(review): the lookup uses tree.label as is, whereas child labels
    # are upper-cased and cut at '[' before comparison -- confirm intended.
    head = None
    children = tree
    for direction, heads in headrules.get(tree.label, []):
        if direction.startswith('LEFT'):
            children = tree
        elif direction.startswith('RIGHT'):
            children = tree[::-1]
        else:
            raise ValueError('expected RIGHT or LEFT.')
        if direction in ('LEFTDIS', 'RIGHTDIS'):
            head = invfind(heads, children)
        else:
            head = find(heads, children)
        if head is not None:
            break
    if head is None:
        # default head is initial/last nonterminal (depending on direction)
        # ``children`` still holds the scan order of the last rule examined.
        for child in children:
            if (isinstance(child, Tree)
                    and not ispunct(None, child)):
                return child
        return children[0]
    else:  # PTB-specific
        i = tree.index(head)
        if i >= 2 and tree[i - 1].label in {'CC', 'CONJP'}:
            # scan leftwards, starting just before the conjunction.
            for althead in tree[i - 2::-1]:
                if not ispunct(althead.label, althead):
                    return althead
    return head
def readmodifierrules(filename):
    """Read a file containing heuristic rules for marking modifiers.

    Example line: ``S *-MOD``, which means that for an S constituent, any
    child with the MOD function tag is a modifier. A default rule can be
    specified by using * as the first label, which always matches (in
    addition to another matching rule, if any). If none of the rules
    matches, a non-terminal is assumed to be a complement.

    Each line is stripped and upper-cased; empty lines and lines starting
    with ``%`` are ignored.

    :param filename: path of a UTF-8 encoded rule file.
    :returns: dict mapping each label to its list of modifier patterns.
    :raises ValueError: if a line consists of a label without any pattern,
        or if a label occurs on more than one line.
    """
    modifierrules = {}
    with io.open(filename, encoding='utf8') as inp:
        for lineno, line in enumerate(inp, 1):
            line = line.strip().upper()
            if not line or line.startswith("%"):
                continue
            fields = line.split(None, 1)
            if len(fields) != 2:
                # report the malformed line instead of failing on the
                # tuple unpacking with an uninformative message.
                raise ValueError('line %d: expected a label followed by at '
                        'least one modifier pattern, got %r' % (lineno, line))
            label, modifiers = fields
            if label in modifierrules:
                raise ValueError('duplicate rule for %r (each label'
                        ' should occur at most once in the file)' % label)
            modifierrules[label] = modifiers.split()
    return modifierrules


def markmodifiers(tree, modifierrules):
    """Use heuristics to distinguish complements from modifiers.

    Should be applied after heads have been identified. Children whose
    ``type`` is ``HEAD`` are left alone; every other child is marked as
    ``COMPLEMENT``, unless a rule matches it or its label equals the label
    of the preceding non-head child, in which case it becomes ``MODIFIER``.

    A pattern has the form ``LABEL`` or ``LABEL-FUNCTION``; either part may
    be ``*`` to match anything. The label part is compared with the child's
    label up to its first ``-``, upper-cased; the function part with
    ``function(child)``, upper-cased.

    :param tree: the node whose children are to be marked (in place).
    :param modifierrules: dict mapping a label to a list of patterns; the
        rules for ``tree.label`` up to its first ``-`` apply, followed by
        the rules under ``*``.
    """
    # imported here rather than at module level, as in the original.
    from discodop.treebanktransforms import function
    # The applicable rules depend only on the parent, so look them up and
    # split them once instead of once per child.
    applicablerules = modifierrules.get(tree.label.split('-', 1)[0], []
            ) + modifierrules.get('*', [])
    patterns = [mod.partition('-') for mod in applicablerules]
    prev = None
    for child in tree:
        if child.type == HEAD:
            continue
        child.type = COMPLEMENT
        if patterns:
            childlabel = child.label.split('-', 1)[0].upper()
            for modlabel, hasfunc, modfunc in patterns:
                if ((childlabel == modlabel or modlabel == '*')
                        and (not hasfunc or modfunc == '*'
                            or function(child).upper() == modfunc)):
                    child.type = MODIFIER
                    break
        if child.label == prev:  # mark enumerations/lists as modifiers
            child.type = MODIFIER
        prev = child.label
def saveheads(tree, tailmarker):
    """Infer head from binarization and store.

    With a non-empty ``tailmarker``, every node whose label contains the
    marker is itself marked as ``HEAD``. Otherwise a head-outward
    binarization is assumed: for every node whose label contains ``|<`` and
    that has no child whose label starts with the same prefix (up to and
    including ``|<``), the last child is marked as ``HEAD``.

    :param tree: the binarized tree; node types are modified in place.
    :param tailmarker: substring identifying head nodes, or a false value.
    """
    if tailmarker:
        for node in tree.subtrees(lambda n: tailmarker in n.label):
            node.type = HEAD
        return

    def islastbinarized(node):
        """Test for a binarized node without a further binarized child."""
        label = node.label
        if '|<' not in label:
            return False
        prefix = label[:label.index('|<') + 2]
        return not any(child.label.startswith(prefix) for child in node)

    # assume head-outward binarization; the last binarized node has the head.
    for node in tree.subtrees(islastbinarized):
        node[-1].type = HEAD
def headstats(trees):
    """Collect some information useful for writing headrules.

    Only nodes with more than one child are counted.

    - ``heads['NP']['NN'] ==`` number of times NN occurs as head of NP.
    - ``pos1['NP'][1] ==`` number of times the head of NP is the child at
      index 1 (counting from 0).
    - ``pos2`` counts the same heads under the key
      ``len(node) - (index + 2)``, i.e. -1 when the head is the last child,
      0 when it is the second to last, and so on.
    - ``unknown['NP']['NN'] ==`` number of times an NP that does not have a
      head dominates an NN.

    NOTE(review): the ``pos2`` key is kept exactly as it was; whether the
    last child was meant to be counted under 0 should be confirmed.

    :returns: the tuple ``(heads, unknown, pos1, pos2)``, each a
        ``defaultdict`` of ``Counter`` objects.
    """
    heads = defaultdict(Counter)
    unknown = defaultdict(Counter)
    pos1 = defaultdict(Counter)
    pos2 = defaultdict(Counter)
    for tree in trees:
        for parent in tree.subtrees(lambda x: len(x) > 1):
            # only the first child marked as head is counted.
            found = next(
                ((idx, child) for idx, child in enumerate(parent)
                    if child.type == HEAD),
                None)
            if found is None:
                unknown[parent.label].update(
                    child.label for child in parent)
                continue
            idx, headchild = found
            heads[parent.label][headchild.label] += 1
            pos1[parent.label][idx] += 1
            pos2[parent.label][len(parent) - (idx + 2)] += 1
    return heads, unknown, pos1, pos2
# Names exported by ``from discodop.heads import *``.
# NOTE(review): readmodifierrules and markmodifiers are defined in this
# module but are not listed here -- confirm whether that is intentional.
__all__ = ['getheadpos', 'readheadrules', 'headfinder', 'saveheads', 'headstats', 'applyheadrules']