# Source code for discodop.lexicon

# -*- coding: UTF-8 -*-
"""Add rules to handle unknown words and smooth lexical probabilities.

Rare words in the training set are replaced with word signatures, such that
unknown words can receive similar tags. Given a function to produce such
signatures from words, the flow is as follows:

- Simple lexical smoothing:

  #. getunknownwordmodel (get statistics)
  #. replaceraretrainwords (adjust trees)
  #. [ read off grammar ]
  #. simplesmoothlexicon (add extra lexical productions)

- During parsing:

  #. replaceraretestwords (only give known words and signatures to parser)
  #. restore original words in derivations

"""
# pylint: disable=abstract-class-instantiated
import os
import re
import logging
import tempfile
from operator import itemgetter
from subprocess import Popen, PIPE
from collections import defaultdict, Counter, OrderedDict
from .tree import escape
from .util import which
from .containers import REMOVESTATESPLITS

# Prefix of every unknown word signature; used on its own as the catch-all
# signature when a computed signature is not part of the grammar.
UNK = '_UNK'
# A run of digits, optionally preceded by digits and a single one of , . '
# (e.g., "42", "3.14", "1,000"; not "1,000,000").
NUMBERRE = re.compile('^(?:[0-9]*[,.\'])?[0-9]+$')
# A four digit number starting with 19 or 20.
YEARRE = re.compile('^(?:19|20)[0-9]{2}$')
# Message of the ValueError raised by externaltagging() when the tree-tagger
# binary is not found at the expected relative path.
TREETAGGERHELP = '''tree tagger not found. commands to install:
mkdir tree-tagger && cd tree-tagger
wget ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tree-tagger-linux-3.2.tar.gz
tar -xzf tree-tagger-linux-3.2.tar.gz
wget ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tagger-scripts.tar.gz
tar -xzf tagger-scripts.tar.gz
mkdir lib && cd lib && wget \
ftp://ftp.ims.uni-stuttgart.de/pub/corpora/german-par-linux-3.2-utf8.bin.gz
gunzip german-par-linux-3.2-utf8.bin.gz'''
# Message of the ValueError raised by externaltagging() when the Stanford
# tagger directory is not found in the current working directory.
STANFORDTAGGERHELP = '''Stanford tagger not found. Commands to install:
wget http://nlp.stanford.edu/software/stanford-postagger-full-2012-07-09.tgz
tar -xzf stanford-postagger-full-2012-07-09.tgz'''


def getunknownwordmodel(tagged_sents, unknownword,
		unknownthreshold, openclassthreshold):
	"""Collect statistics for an unknown word model.

	:param tagged_sents: the sentences from the training set with the gold POS
			tags from the treebank; a sequence of lists of ``(word, tag)``
			pairs. It is traversed several times, so it must not be a
			one-shot iterator.
	:param unknownword: a function that returns a signature for a given word;
			e.g., "eschewed" => "_UNK-L-d". It is called as
			``unknownword(word, index_in_sentence, lexicon)``.
	:param unknownthreshold: words with frequency lower than or equal to this
			are replaced by their signature.
	:param openclassthreshold: tags that rewrite to at least this many word
			types are considered to be open class categories, so that open
			class words and tags can be identified. If this is 0 or None,
			no open/closed class distinction is made: all tags are treated
			as open class and ``closedclasswords`` is empty.
	:returns: a tuple ``(lexmodel, msg)``, where ``lexmodel`` is the tuple
			``(sigs, words, lexicon, closedclasswords, tags, wordtags)``
			and ``msg`` is a human-readable summary of the statistics."""
	wordsfortag = defaultdict(set)
	tags = Counter()
	wordtags = Counter()
	words = Counter(word for sent in tagged_sents for word, _ in sent)
	lexicon = {word for word, freq in words.items()
			if freq > unknownthreshold}
	for sent in tagged_sents:
		for word, tag in sent:
			wordsfortag[tag].add(word)
			tags[tag] += 1
			wordtags[word, tag] += 1
	# These are initialized unconditionally because the summary message
	# below reports on all of them, also when openclassthreshold is disabled.
	openclasstags = {}
	closedclasstags = {}
	closedclasswords = set()
	wordsforcollapsedtags = defaultdict(set)
	if openclassthreshold:
		# consider POS tags with different function tags together
		for tag, ws in wordsfortag.items():
			match = REMOVESTATESPLITS.match(tag)
			if match is not None:
				tag1 = match.group(2)
				wordsforcollapsedtags[tag1].update(w.lower() for w in ws)
		for tag, ws in wordsfortag.items():
			# number of case-insensitive word types observed with this tag
			numtypes = len({w.lower() for w in ws})
			match = REMOVESTATESPLITS.match(tag)
			if match is None:  # should not happen (tm), only for robustness
				isopen = numtypes >= openclassthreshold
			else:
				# the decision is made on the collapsed tag, but the count
				# that is stored is that of the original tag.
				isopen = (len(wordsforcollapsedtags[match.group(2)])
						>= openclassthreshold)
			if isopen:
				openclasstags[tag] = numtypes
		closedclasstags = {tag: len({w.lower() for w in wordsfortag[tag]})
				for tag in tags if tag not in openclasstags}
		# FIXME: do we want to preserve case? could simplify code otherwise
		closedclasswords = {word for tag in closedclasstags
				for word in wordsfortag[tag]}
		# NB: words below unknown word threshold are included
		# add rare closed-class words back to lexicon
		lexicon.update(closedclasswords)
	sigs = Counter()
	for sent in tagged_sents:
		for n, (word, _) in enumerate(sent):
			if word not in lexicon:
				# NB: sig depends on n and lexicon
				sig = unknownword(word, n, lexicon)
				sigs[sig] += 1
	msg = 'word types: %d, signature types: %d\n' % (
			len(lexicon), len(sigs))
	msg += 'number of words per collapsed tag: %s\n' % ' '.join(sorted(
			'%s:%d' % (a, len(b)) for a, b in wordsforcollapsedtags.items()))
	msg += 'open class tags: %s\n\n' % ' '.join(sorted(
			'%s:%d' % a for a in openclasstags.items()))
	msg += 'closed class tags: %s\n' % ' '.join(sorted(
			'%s:%d' % a for a in closedclasstags.items()))
	msg += 'closed class words: %s\n' % ' '.join(sorted(closedclasswords))
	return (sigs, words, lexicon, closedclasswords, tags, wordtags), msg


def replaceraretrainwords(tagged_sents, unknownword, lexicon):
	"""Normalize the words of the training sentences.

	Years become ``'1970'``, other numbers become ``'000'``, and any
	remaining word that is not in ``lexicon`` is replaced by the signature
	returned by ``unknownword(word, index, lexicon)``.

	:param tagged_sents: sentences as lists of ``(word, tag)`` pairs.
	:returns: a list of sentences, each a list of words (tags are dropped).
	"""
	result = []
	for sent in tagged_sents:
		newsent = []
		for idx, (token, _) in enumerate(sent):
			if YEARRE.match(token):
				token = '1970'
			elif NUMBERRE.match(token):
				token = '000'
			elif token not in lexicon:
				token = unknownword(token, idx, lexicon)
			newsent.append(token)
		result.append(newsent)
	return result


def replaceraretestwords(sent, unknownword, lexicon, sigs):
	"""Generate the words of a test sentence with unknown words replaced.

	Years become ``'1970'`` and other numbers ``'000'``. A word in the
	lexicon is passed through; if only its lowercase version is in the
	lexicon, that version is produced instead. Any other word is replaced
	by its signature from ``unknownword()``, or by the default signature if
	that signature is not in ``sigs``."""
	for idx, token in enumerate(sent):
		if YEARRE.match(token):
			result = '1970'
		elif NUMBERRE.match(token):
			result = '000'
		elif token in lexicon:
			result = token
		else:
			lowered = token.lower()
			if lowered in lexicon:
				result = lowered
			else:
				result = unknownword(token, idx, lexicon)
				if result not in sigs:
					# signature never observed in training
					result = UNK
		yield result


def simplesmoothlexicon(lexmodel, epsilon=1. / 100):
	"""Produce extra lexical productions for smoothing.

	- rare words get productions with the word itself, in addition to the
	  productions with their signature.
	- every tag gets a production for the catch-all signature ``_UNK``, to
	  cover signatures that were not observed.
	- (unobserved combinations of open class (word, tag) handled in parser).

	:param lexmodel: the tuple of statistics ``(sigs, words, lexicon,
		closedclasswords, tags, wordtags)``.
	:param epsilon: pseudo-frequency of unseen productions ``tag => word``.
	:returns: a dictionary of lexical rules, with pseudo-frequencies as values.
	"""
	_sigs, _words, lexicon, _closedclasswords, tags, wordtags = lexmodel
	# rare words as signature AND as word; the observed frequency is used,
	# which needs to be normalized later.
	newrules = {
			((tag, 'Epsilon'), (escape(word), )): freq
			for (word, tag), freq in wordtags.items()
			if word not in lexicon}
	# catch-all unknown signature for each tag
	newrules.update(
			(((tag, 'Epsilon'), (UNK, )), epsilon) for tag in tags)
	return newrules


# === functions for unknown word signatures ============

# All patterns below are meant to be used with .search(), i.e., they test
# whether a word contains at least one character of the given class.
HASDIGIT = re.compile(r"\d", re.UNICODE)
HASNONDIGIT = re.compile(r"\D", re.UNICODE)
# NB: includes '-', hyphen, non-breaking hyphen
# does NOT include: figure-dash, em-dash, en-dash (these are punctuation,
# not word-combining) u2012-u2015; nb: these are hex values.
HASDASH = re.compile("[-\u2010\u2011]")
# ASCII letters plus a fixed list of accented letters and ligatures.
# NOTE(review): the lists contain i-grave (\xec / \xcc) but not e-grave
# (\xe8 / \xc8); possibly a typo given the French alphabet reference below,
# but changing it would alter the signatures -- confirm before touching.
# FIXME: exclude accented characters for model 6?
HASLOWER = re.compile('[a-z\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb\xeb'
		'\xef\xfc\xff\u0153\xe6]')
HASUPPER = re.compile('[A-Z\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4\xdb\xcb'
		'\xcf\xdc\u0178\u0152\xc6]')
HASLETTER = re.compile('[A-Za-z\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb'
		'\xeb\xef\xfc\xff\u0153\xe6\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4'
		'\xdb\xcb\xcf\xdc\u0178\u0152\xc6]')
# The same character classes as plain strings, for membership tests on
# single characters (``char in UPPER``).
# Cf. http://en.wikipedia.org/wiki/French_alphabet
LOWER = ('abcdefghijklmnopqrstuvwxyz\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb'
		'\xeb\xef\xfc\xff\u0153\xe6')
UPPER = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4\xdb'
		'\xcb\xcf\xdc\u0178\u0152\xc6')
LOWERUPPER = LOWER + UPPER


def unknownword6(word, loc, lexicon):
	"""Model 6 of the Stanford parser (for WSJ treebank).

	:param word: the word to produce a signature for.
	:param loc: index of the word in its sentence; a single capital is
		treated differently at the start of a sentence.
	:param lexicon: set of known words, consulted for the lowercased form of
		a sentence-initial capitalized word.
	:returns: a signature string starting with ``_UNK``."""
	lowered = word.lower()
	numcaps = len(HASUPPER.findall(word))
	hasdigit = HASDIGIT.search(word) is not None
	hasdash = HASDASH.search(word) is not None
	parts = [UNK]

	# capitalization
	if numcaps > 1:
		parts.append("-CAPS")
	elif numcaps:
		if loc == 0:
			parts.append("-INITC")
			if lowered in lexicon:
				parts.append("-KNOWNLC")
		else:
			parts.append("-CAP")
	elif HASLOWER.search(word):
		parts.append("-LC")

	if hasdigit:
		parts.append("-NUM")
	if hasdash:
		parts.append("-DASH")

	# suffixes
	if lowered.endswith('s') and len(word) >= 3:
		# a final 's' preceded by s, i, or u does not get a mark
		if lowered[-2] not in 'siu':
			parts.append('-s')
	elif len(word) >= 5 and not hasdash and not (hasdigit and numcaps > 0):
		# only the first matching suffix is used
		found = next((suffix for suffix in (
				'ed', 'ing', 'ion', 'er', 'est', 'ly', 'ity', 'y', 'al')
				if lowered.endswith(suffix)), None)
		if found is not None:
			parts.append("-%s" % found)
	return ''.join(parts)


def unknownword4(word, loc, _lexicon):
	"""Model 4 of the Stanford parser. Relatively language agnostic.

	The signature encodes, in this order: capitalization, presence of
	digits, presence of ``-``, ``.`` and ``,``, and for words of more than
	three characters ending in a letter, the lowercased last two characters.

	:param word: the word to produce a signature for.
	:param loc: index of the word in its sentence.
	:param _lexicon: ignored; present for a uniform interface.
	:returns: a signature string starting with ``_UNK``."""
	haslower = HASLOWER.search(word) is not None

	# letters
	if word and word[0] in UPPER:
		if not haslower:
			case = "-AC"  # no lowercase letters at all
		elif loc == 0:
			case = "-SC"  # capitalized at start of sentence
		else:
			case = "-C"
	elif haslower:
		case = "-L"
	elif HASLETTER.search(word):
		case = "-U"
	else:
		case = "-S"  # no letter
	sig = UNK + case

	# digits
	if HASDIGIT.search(word):
		sig += "-n" if HASNONDIGIT.search(word) else "-N"

	# punctuation
	for char, mark in (("-", "-H"), (".", "-P"), (",", "-C")):
		if char in word:
			sig += mark

	# last two characters
	if len(word) > 3 and word[-1] in LOWERUPPER:
		sig += "-%s" % word[-2:].lower()
	return sig


def unknownwordbase(word, _loc, _lexicon):
	"""BaseUnknownWordModel of the Stanford parser.

	Relatively language agnostic.

	:param word: the word to produce a signature for; an empty string is
		accepted and treated as not capitalized.
	:param _loc: ignored; present for a uniform interface.
	:param _lexicon: ignored; present for a uniform interface.
	:returns: a signature string starting with ``_UNK``."""
	sig = UNK

	# letters; the emptiness check avoids an IndexError on an empty token,
	# consistent with the other signature functions in this module.
	if word and word[0] in UPPER:
		sig += "-C"
	else:
		sig += "-c"

	# digits
	if HASDIGIT.search(word):
		if HASNONDIGIT.search(word):
			sig += "-n"
		else:
			sig += "-N"

	# punctuation; NB: '.' and ',' are only marked when they are the whole
	# word, while a hyphen is marked anywhere in the word.
	if "-" in word:
		sig += "-H"
	if word == ".":
		sig += "-P"
	if word == ",":
		sig += "-C"
	# lowercased last two characters, for longer words ending in a letter
	if len(word) > 3:
		if word[-1] in LOWERUPPER:
			sig += "-%s" % word[-2:].lower()
	return sig


# Suffix patterns for French words; all are anchored at the end of the word
# and are applied with .search() by unknownwordftb().
# Noun suffixes, with an optional plural 's'.
NOUNSUFFIX = re.compile("(ier|ière|ité|ion|ison|isme|ysme|iste|esse|eur|euse"
		"|ence|eau|erie|ng|ette|age|ade|ance|ude|ogue|aphe|ate|duc|anthe"
		"|archie|coque|érèse|ergie|ogie|lithe|mètre|métrie|odie|pathie|phie"
		"|phone|phore|onyme|thèque|scope|some|pole|ôme|chromie|pie)s?$")
# Adjective suffixes, with an optional plural 's'.
ADJSUFFIX = re.compile("(iste|ième|uple|issime|aire|esque|atoire|ale|al|able"
		"|ible|atif|ique|if|ive|eux|aise|ent|ois|oise|ante|el|elle|ente|oire"
		"|ain|aine)s?$")
# Word ends in 's' or 'ux'.
POSSIBLEPLURAL = re.compile("(s|ux)$")
# Verb suffixes.
VERBSUFFIX = re.compile("(ir|er|re|ez|ont|ent|ant|ais|ait|ra|era|eras"
		"|é|és|ées|isse|it)$")
# Adverb suffixes.
ADVSUFFIX = re.compile("(iment|ement|emment|amment)$")
# One or more characters from a set of punctuation and symbol ranges,
# anywhere in the word.
HASPUNC = re.compile("([\u0021-\u002F\u003A-\u0040\u005B\u005C\u005D"
		"\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E"
		"\u20A0-\u20B5])+")
# Same character set, but anchored at the end of the word.
# NOTE(review): there is no anchor at the start, so with .search() this
# matches any word that merely ends in such a character, not only words
# consisting entirely of them -- confirm whether a full match was intended.
ISPUNC = re.compile("([\u0021-\u002F\u003A-\u0040\u005B\u005C\u005D"
		"\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E"
		"\u20A0-\u20B5])+$")


def unknownwordftb(word, loc, _lexicon):
	"""Model 2 for French of the Stanford parser.

	The signature encodes, in this order: at most one of an adverb, verb, or
	noun suffix; an adjective suffix; presence of digits; a possible plural
	ending; punctuation; and capitalization of a non-initial word.

	:param word: the word to produce a signature for.
	:param loc: index of the word in its sentence; capitalization is only
		marked for words that are not sentence-initial.
	:param _lexicon: ignored; present for a uniform interface.
	:returns: a signature string starting with ``_UNK``."""
	sig = UNK

	# mutually exclusive suffix classes; the first match wins
	if ADVSUFFIX.search(word):
		sig += "-ADV"
	elif VERBSUFFIX.search(word):
		sig += "-VB"
	elif NOUNSUFFIX.search(word):
		sig += "-NN"

	# the adjective suffix is tested independently of the classes above and
	# gets its own label, so that it is not confused with the adverb mark.
	if ADJSUFFIX.search(word):
		sig += "-ADJ"
	if HASDIGIT.search(word):
		sig += "-NUM"
	if POSSIBLEPLURAL.search(word):
		sig += "-PL"

	if ISPUNC.search(word):
		sig += "-ISPUNC"
	elif HASPUNC.search(word):
		sig += "-HASPUNC"

	if loc > 0 and word and word[0] in UPPER:
		sig += "-UP"

	return sig


# Maps the name of an unknown word model to the function that computes its
# signatures; each function takes (word, index in sentence, lexicon).
# Presumably the keys are what a configuration selects; verify against
# callers.
UNKNOWNWORDFUNC = {
		"4": unknownword4,
		"6": unknownword6,
		"base": unknownwordbase,
		"ftb": unknownwordftb,
		}


# === Performing POS tagging with external tools ============
def externaltagging(usetagger, model, sents, overridetag, tagmap):
	"""Use an external tool to tag a list of sentences.

	:param usetagger: which tagger to run; one of ``'treetagger'``,
		``'stanford'``, or ``'frog'``.
	:param model: path of the tagger model; if empty, a default German model
		is used for treetagger and stanford. Not used for frog.
	:param sents: a mapping of sentence IDs to sentences, where each
		sentence is a list of ``(word, gold tag)`` pairs.
	:param overridetag: a mapping of tags to collections of words which
		should always get that tag; passed on to ``tagmangle()``
		(not applied for frog).
	:param tagmap: a mapping of tagger tags to replacement tags; passed on
		to ``tagmangle()`` (not applied for frog).
	:returns: an OrderedDict mapping each sentence ID to a list of
		``(word, tag)`` pairs as produced by the tagger.
	:raises ValueError: if the tagger is unknown or not installed, or if
		the tagger output does not align with the input."""
	logging.info('Start tagging.')
	goldtags = [t for sent in sents.values() for _, t in sent]
	if usetagger == 'treetagger':  # Tree-tagger
		if not os.path.exists('tree-tagger/bin/tree-tagger'):
			raise ValueError(TREETAGGERHELP)
		infile, inname = tempfile.mkstemp(text=True)
		try:
			# one token per line; each sentence is followed by an <S> line.
			# The words are str, so the encoding is handled by the file.
			with os.fdopen(infile, 'w', encoding='utf-8') as inp:
				for tagsent in sents.values():
					inp.write('\n'.join(
							map(itemgetter(0), tagsent)) + '\n<S>\n')
			filtertags = ''
			if not model:
				model = 'tree-tagger/lib/german-par-linux-3.2-utf8.bin'
				filtertags = '| tree-tagger/cmd/filter-german-tags'
			# NOTE(review): the model path is interpolated into a shell
			# command; it must come from a trusted source.
			tagger = Popen('tree-tagger/bin/tree-tagger -token -sgml'
					' %s %s %s' % (model, inname, filtertags),
					stdout=PIPE, shell=True)
			# communicate() reads all output and waits for the process
			tagout = tagger.communicate()[0].decode(
					'utf-8').split('<S>')[:-1]
		finally:
			# remove the temporary file also when something went wrong
			os.unlink(inname)
		taggedsents = OrderedDict((n, [tagmangle(a, None, overridetag, tagmap)
					for a in tags.splitlines() if a.strip()])
					for n, tags in zip(sents, tagout))
	elif usetagger == 'stanford':  # Stanford Tagger
		if not os.path.exists('stanford-postagger-full-2012-07-09'):
			raise ValueError(STANFORDTAGGERHELP)
		infile, inname = tempfile.mkstemp(text=True)
		try:
			# one sentence per line, tokens separated by spaces
			with os.fdopen(infile, 'w', encoding='utf-8') as inp:
				for tagsent in sents.values():
					inp.write(' '.join(
							map(itemgetter(0), tagsent)) + '\n')
			if not model:
				model = 'models/german-hgc.tagger'
			tagger = Popen(args=(
					'/usr/bin/java -mx2G -classpath stanford-postagger.jar'
					' edu.stanford.nlp.tagger.maxent.MaxentTagger'
					' -tokenize false -encoding utf-8'
					' -model %s -textFile %s' % (model, inname)).split(),
					cwd='stanford-postagger-full-2012-07-09',
					shell=False, stdout=PIPE)
			tagout = tagger.communicate()[0].decode('utf-8').splitlines()
		finally:
			os.unlink(inname)
		taggedsents = OrderedDict((n, [tagmangle(a, '_', overridetag, tagmap)
			for a in tags.split()]) for n, tags in zip(sents, tagout))
	elif usetagger == 'frog':  # Dutch 'frog' tagger
		tagger = Popen(args=[which('frog')]
					+ '-n --skip=tacmnp -t /dev/stdin'.split(),
				shell=False, stdin=PIPE, stdout=PIPE)
		tagout, stderr = tagger.communicate(''.join(
				' '.join(w for w in map(itemgetter(0), tagsent)) + '\n'
				for tagsent in sents.values()).encode('utf8'))
		logging.info(stderr)
		# lines consist of: 'idx token lemma POS score'
		taggedsents = OrderedDict((n,
				[(line.split()[1],
					line.split()[3].replace('(', '[').replace(')', ']'))
					for line in lines.splitlines()]) for n, lines
				in zip(sents, tagout.decode('utf-8').split('\n\n')))
	else:
		# without this, the code below would fail on an unbound variable
		raise ValueError('unknown tagger: %r' % (usetagger, ))
	if len(taggedsents) != len(sents):
		raise ValueError('mismatch in number of sentences after tagging.')
	for n, tags in taggedsents.items():
		if len(sents[n]) != len(tags):
			raise ValueError('mismatch in number of tokens after tagging.\n'
				'before: %r\nafter: %r' % (sents[n], tags))
	newtags = [t for sent in taggedsents.values() for _, t in sent]
	logging.info('Tag accuracy: %5.2f\ngold - cand: %r\ncand - gold %r',
		(100 * accuracy(goldtags, newtags)),
		set(goldtags) - set(newtags), set(newtags) - set(goldtags))
	return taggedsents


def accuracy(gold, cand):
	"""Return the proportion of positions at which two sequences agree.

	The sequences are compared pairwise up to the length of the shortest
	one; the number of equal pairs is divided by the length of ``gold``."""
	matches = 0
	for expected, predicted in zip(gold, cand):
		if expected == predicted:
			matches += 1
	return matches / len(gold)


def tagmangle(a, splitchar, overridetag, tagmap):
	"""Post-process a single token produced by a tagger.

	:param a: a string with a word and its tag, separated by ``splitchar``.
	:param splitchar: the separator; the string is split on its last
		occurrence. None means whitespace.
	:param overridetag: a mapping of tags to collections of words; a word
		found in such a collection gets that tag instead of the tagger's.
		If a word is listed under several tags, the last one wins.
	:param tagmap: a mapping applied to the resulting tag; tags without an
		entry are left as is.
	:returns: a tuple ``(word, tag)``."""
	word, tag = a.rsplit(splitchar, 1)
	for replacement, words in overridetag.items():
		if word in words:
			tag = replacement
	return word, tagmap.get(tag, tag)


# The following is based on the undo-compounds transformation of FTB in
# Candito, M., Crabbé, B., & Denis, P. (2010). Statistical French dependency
# parsing: treebank conversion and first results.
# http://www.lrec-conf.org/proceedings/lrec2010/pdf/392_Paper.pdf

# Regular expressions for regular compound patterns which will be undone.
# The keys are category labels; the patterns are written over sequences of
# tags separated by single spaces (presumably the POS tags of the parts of
# a compound; verify against the code that applies them).
FTBREGULARCOMPOUNDPATTERNS = {
		# an N, maybe with Det, Adj, and PPs:
		# "Organisation de coopération et de développement économique"
		# "Institut de formation des agents de voyages"
		# "pomme de terre"
		# "marché monétaire et obligataire"
		# "Bureau de recherches géologiques et minières"
		'N': re.compile(
			r'(D )?(A )*N( A( C A)?)*( P(\+D)?( D)?( A)* N( A( C A)?)*'
			r'(( C)? P(\+D)?( D)?( A)* N( A)*)*)?$'
			# r'|N( N)+$'  # Air France, maison mère ...
			# r'|(N|ET)( ET)+$'  # Wall Street, Zenith Data Structures,
			),
		'V': re.compile(
			r'V( V)*( P| A| D)*( N)+( P)*$'  # mettre en place, etc.
			r'|V( D)?( A)?( N)+$'),  # faire face, faire appel
		'P': re.compile(r'P(\+D)? (D )?(A )*N( P(\+D)?( D)?( A)* N( A( C A)?)*'
			r'(( C)? P(\+D)?( D)?( A)* N( A)*)*)? P(\+D)?$'),
		'ADV': re.compile(r'P(\+D)? (D )?(A )*N( P(\+D)?( D)?( A)* N( A)*)?$'),
		# NOTE(review): unlike the other patterns, this one has no '$' anchor
		# and requires a space after the first A -- confirm this is intended.
		'A': re.compile(r'A (C A)?'),
		}

# The following compounds should not be undone; mostly organization names or
# fixed expressions like "aujourd'hui". Each entry is a complete bracketed
# subtree, written as a string.
# TODO: extend the list of allowed compounds.
FTBALLOWEDCOMPOUNDS = {
		'(MWN (N Fondation) (N France) (A active))',
		'(MWN (N Côte) (PONCT -) (P d\') (N Ivoire))',
		'(MWP (CL Il) (CL y) (V a))',
		'(MWN (N Jean) (PONCT -) (N Louis))',
		'(MWADV (ADV tout) (P de) (N suite))',
		'(MWP (P jusqu\') (P au))',
		'(MWN (A Haute) (PONCT -) (N Corse))',
		'(MWP (CL y) (A compris))',
		'(MWN (N Chalon) (PONCT -) (P sur) (PONCT -) (N Saône))',
		'(MWP (CL il) (CL y) (V a))',
		'(MWN (N Royaume) (PONCT -) (A uni))',
		'(MWN (N Seine) (PONCT -) (N Saint) (PONCT -) (N Denis))',
		'(MWN (N Roche) (PONCT -) (N la) (PONCT -) (N Molière))',
		'(MWN (N Air) (N France))',
		'(MWN (A Grande) (PONCT -) (N Bretagne))',
		'(MWN (N Union) (A soviétique))',
		}


# Names exported by ``from discodop.lexicon import *``; the helper accuracy()
# and the module-level constants are not part of this list.
__all__ = ['getunknownwordmodel', 'replaceraretrainwords',
		'replaceraretestwords', 'simplesmoothlexicon',
		'unknownword6', 'unknownword4', 'unknownwordbase',
		'unknownwordftb', 'externaltagging', 'tagmangle']