# Source code for discodop.heads

"""Functions related to finding the linguistic head of a constituent."""
import io
import re
from collections import defaultdict, Counter
from .tree import Tree, HEAD, COMPLEMENT, MODIFIER
from .punctuation import ispunct

# Integer positions 0..7, one per named field; headfinder() below uses
# ``child.source[FUNC]`` to read the function tag of a node.
(WORD, LEMMA, TAG, MORPH, FUNC, PARENT,
		SECEDGETAG, SECEDGEPARENT) = FIELDS = tuple(range(8))
# One head rule: a label, a direction keyword (or LIKE), and optionally the
# rest of the line (captured as a single string).
HEADRULERE = re.compile(
		r'^(\S+)'
		r'\s+(LEFT-TO-RIGHT|RIGHT-TO-LEFT|LEFT|RIGHT|LEFTDIS|RIGHTDIS|LIKE)'
		r'(?:\s+(.*))?$')


def applyheadrules(tree, headrules, modifierrules=None):
	"""Apply head rules and set head attribute of nodes.

	Every subtree of ``tree`` that is non-empty and whose first child is a
	Tree is passed to headfinder(); the child it returns gets ``type`` set
	to HEAD. The tree is modified in place and nothing is returned.

	:param tree: the tree whose nodes should be marked.
	:param headrules: head rules as expected by headfinder().
	:param modifierrules: if not None, markmodifiers() is run with these
		rules on each node for which a head was found."""
	for node in tree.subtrees(
			lambda n: n and isinstance(n[0], Tree)):
		head = headfinder(node, headrules)
		if head is None:
			continue
		head.type = HEAD
		if modifierrules is not None:
			markmodifiers(node, modifierrules)


def getheadpos(node):
	"""Follow head-marked children down from ``node`` to a lowest node.

	Starting at ``node``, repeatedly move to the first child whose ``type``
	is HEAD, until a node is reached whose first child is not a Tree.

	:returns: that node (not the word it dominates), or None when an empty
		node is encountered or some node on the way has no child marked
		as HEAD."""
	current = node
	while current:
		if not isinstance(current[0], Tree):
			return current
		# None (no head-marked child) ends the loop, like an empty node does.
		current = next((kid for kid in current if kid.type == HEAD), None)
	return None


def readheadrules(filename):
	"""Read a file containing heuristic rules for head assignment.

	Example line: ``s right-to-left vmfin vafin vaimp``, which means
	traverse siblings of an S constituent from right to left, the first child
	with a label of vmfin, vafin, or vaimp will be marked as head.

	Lines are stripped and upper-cased before parsing. Empty lines, lines
	starting with ``%``, and lines with fewer than three whitespace-separated
	fields are skipped. A line ``X LIKE Y`` copies the rules collected so
	far for ``Y`` to ``X``.

	:param filename: path of a UTF-8 encoded rule file.
	:returns: a dictionary mapping each label to a list of
		``(direction, [head, ...])`` tuples, in file order.
	:raises AttributeError: if a line with three or more fields does not
		match HEADRULERE; the offending line is printed first.
	:raises KeyError: if a LIKE rule refers to a label without rules so far.
	"""
	headrules = {}
	with io.open(filename, encoding='utf8') as inp:
		for line in inp:
			line = line.strip().upper()
			if not line or line.startswith("%"):
				continue
			# NOTE(review): this also skips rules that give only a label and
			# a direction, even though HEADRULERE accepts them and the
			# ``heads is None`` case below would handle them; confirm whether
			# such rules are meant to be ignored.
			if len(line.split()) <= 2:
				continue
			try:
				label, direction, heads = HEADRULERE.match(line).groups()
			except AttributeError:
				# match() returned None: show the line, then re-raise.
				print('no match:', line)
				raise
			if heads is None:
				heads = ''
			rules = headrules.setdefault(label, [])
			if direction == 'LIKE':
				rules.extend(headrules[heads])
			else:
				rules.append((direction, heads.split()))
	return headrules


def headfinder(tree, headrules, headlabels=frozenset({'HD'})):
	"""Use head finding rules to select one child of tree node as head.

	Candidates are tried in this order:

	1. a child whose ``type`` is already HEAD;
	2. a child with a true ``source`` whose upper-cased FUNC field, split on
		``-``, shares an element with ``headlabels``;
	3. the rules in ``headrules[tree.label]``, in order. Directions starting
		with LEFT scan the children left to right, those starting with RIGHT
		scan them right to left. For LEFTDIS/RIGHTDIS the first child
		matching any listed label wins; for the other directions the first
		listed label that matches some child wins;
	4. if no rule matched: the first Tree child, in the direction of the last
		rule tried, for which ispunct() is false; failing that, the first
		child in that direction.

	:param tree: the node whose head child is to be selected.
	:param headrules: dictionary mapping a label to a list of
		``(direction, [label, ...])`` tuples.
	:param headlabels: function tags that identify a head.
	:returns: the selected child of ``tree``.
	:raises ValueError: if a direction starts with neither LEFT nor RIGHT."""
	def matches(child, head):
		"""Test if child is a Tree whose label, cut at ``[`` and
		upper-cased, equals head."""
		return (isinstance(child, Tree)
				and child.label.split('[')[0].upper() == head)

	# check if we already have head information:
	for child in tree:
		if child.type == HEAD:
			return child
	for child in tree:
		if (child.source and not headlabels.isdisjoint(
				child.source[FUNC].upper().split('-'))):
			return child
	# apply heuristic rules:
	# NOTE(review): rules are looked up with tree.label as is, while child
	# labels are cut at '[' and upper-cased before comparison; confirm that
	# parent labels never need the same normalization.
	head = None
	children = tree
	for direction, heads in headrules.get(tree.label, []):
		if direction.startswith('LEFT'):
			children = tree
		elif direction.startswith('RIGHT'):
			children = tree[::-1]
		else:
			raise ValueError('expected RIGHT or LEFT.')
		if direction in ('LEFTDIS', 'RIGHTDIS'):
			# position has priority over the order of the listed labels
			candidates = (child for child in children
					for cand in heads if matches(child, cand))
		else:
			# order of the listed labels has priority over position
			candidates = (child for cand in heads
					for child in children if matches(child, cand))
		head = next(candidates, None)
		if head is not None:
			break
	if head is None:
		# default head is initial/last nonterminal (depending on direction)
		# NOTE(review): the node itself is passed as second argument of
		# ispunct() here and below; confirm against its signature.
		for child in children:
			if (isinstance(child, Tree)
					and not ispunct(None, child)):
				return child
		return children[0]
	# PTB-specific: when the selected head directly follows a CC or CONJP,
	# prefer the nearest sibling to the left of that conjunction for which
	# ispunct() is false.
	i = tree.index(head)
	if i >= 2 and tree[i - 1].label in {'CC', 'CONJP'}:
		for althead in tree[i - 2::-1]:
			if not ispunct(althead.label, althead):
				return althead
	return head


def readmodifierrules(filename):
	"""Load heuristic rules for marking modifiers from a file.

	A rule line consists of a label followed by one or more patterns, e.g.
	``S *-MOD``: for an S constituent, any child with the MOD function tag
	is a modifier. The label ``*`` defines a default rule, which always
	applies (in addition to the rule matching the label, if any). When no
	rule matches, a non-terminal is assumed to be a complement.

	Lines are stripped and upper-cased; empty lines and lines starting with
	``%`` are ignored.

	:param filename: path of a UTF-8 encoded rule file.
	:returns: a dictionary mapping each label to its list of patterns.
	:raises ValueError: if a label occurs on more than one line, or a rule
		line has no patterns after the label.
	"""
	modifierrules = {}
	with io.open(filename, encoding='utf8') as inp:
		for rawline in inp:
			rule = rawline.strip().upper()
			if not rule or rule.startswith("%"):
				continue
			label, modifiers = rule.split(None, 1)
			if label in modifierrules:
				raise ValueError('duplicate rule for %r (each label'
						' should occur at most once in the file)' % label)
			modifierrules[label] = modifiers.split()
	return modifierrules


def markmodifiers(tree, modifierrules):
	"""Use heuristics to distinguish complements from modifiers.

	Should be applied after heads have been identified.

	Every child of ``tree`` whose ``type`` is not HEAD is marked COMPLEMENT,
	unless one of the applicable rules matches it or its label equals the
	label of the previous non-head child, in which case it is marked
	MODIFIER. The applicable rules are those for the part of ``tree.label``
	before the first ``-``, followed by those for ``*``.

	A rule ``LABEL`` or ``LABEL-FUNC`` matches a child when LABEL is ``*``
	or equals the upper-cased part of the child's label before the first
	``-``, and FUNC is absent, is ``*``, or equals the upper-cased result of
	``function(child)``.

	:param tree: the node whose children are to be marked, in place.
	:param modifierrules: dictionary mapping a label to a list of rules."""
	from discodop.treebanktransforms import function
	# The applicable rules depend only on the parent, so look them up and
	# split them into (label, dash, function) once instead of once per child.
	rules = [mod.partition('-') for mod in
			modifierrules.get(tree.label.split('-', 1)[0], [])
			+ modifierrules.get('*', [])]
	prev = None
	for child in tree:
		if child.type == HEAD:
			continue
		child.type = COMPLEMENT
		childlabel = child.label.split('-', 1)[0].upper()
		for modlabel, dash, modfunc in rules:
			# function() is only consulted when the rule names a specific
			# function tag and the label part already matched.
			if ((childlabel == modlabel or modlabel == '*')
					and (not dash or modfunc == '*'
						or function(child).upper() == modfunc)):
				child.type = MODIFIER
				break
		if child.label == prev:  # mark enumerations/lists as modifiers
			child.type = MODIFIER
		prev = child.label


def saveheads(tree, tailmarker):
	"""Infer head from binarization and store.

	:param tree: a binarized tree; ``type`` attributes are set in place.
	:param tailmarker: if non-empty, every node whose label contains this
		string is marked as HEAD. Otherwise a head-outward binarization is
		assumed: for each node with ``|<`` in its label that has no child
		whose label starts with the same prefix (up to and including
		``|<``), the last child is marked as HEAD."""
	if tailmarker:
		for node in tree.subtrees(lambda n: tailmarker in n.label):
			node.type = HEAD
		return

	def islastbinarized(n):
		"""Test if n is a binarized node without a binarized child of the
		same constituent."""
		if '|<' not in n.label:
			return False
		prefix = n.label[:n.label.index('|<') + 2]
		return not any(child.label.startswith(prefix) for child in n)

	# assume head-outward binarization; the last binarized node has the head.
	for node in tree.subtrees(islastbinarized):
		node[-1].type = HEAD


def headstats(trees):
	"""Collect some information useful for writing headrules.

	Only nodes with more than one child are counted; for each of them the
	first child whose ``type`` is HEAD is taken as its head.

	- ``heads['NP']['NN'] ==`` number of times NN occurs as head of NP.
	- ``pos1['NP'][1] ==`` number of times head of NP is at position 1
		(counting from 0 at the first child).
	- ``pos2`` is like pos1, but position is from the right
		(counting from 0 at the last child).
	- ``unknown['NP']['NN'] ==`` number of times NP that does not have a head
		dominates an NN.

	:param trees: an iterable of trees.
	:returns: a tuple ``(heads, unknown, pos1, pos2)`` of defaultdicts
		mapping a parent label to a Counter."""
	heads, unknown = defaultdict(Counter), defaultdict(Counter)
	pos1, pos2 = defaultdict(Counter), defaultdict(Counter)
	for tree in trees:
		for node in tree.subtrees(lambda x: len(x) > 1):
			for n, child in enumerate(node):
				if child.type == HEAD:
					heads[node.label][child.label] += 1
					pos1[node.label][n] += 1
					# mirror of pos1: 0 for the last child, 1 for the one
					# before it, and so on.
					pos2[node.label][len(node) - 1 - n] += 1
					break
			else:
				# no child of this node is marked as head
				unknown[node.label].update(b.label for b in node)
	return heads, unknown, pos1, pos2


# Names exported by ``from ... import *``. readmodifierrules() and
# markmodifiers() are defined in this module but are not listed here.
__all__ = ['getheadpos', 'readheadrules', 'headfinder', 'saveheads',
		'headstats', 'applyheadrules']