# Source code for discodop.lexicon

# -*- coding: UTF-8 -*-
"""Add rules to handle unknown words and smooth lexical probabilities.

Rare words in the training set are replaced with word signatures, such that
unknown words can receive similar tags. Given a function to produce such
signatures from words, the flow is as follows:

- Simple lexical smoothing:

  #. getunknownwordmodel (get statistics)
  #. replaceraretrainwords (adjust trees)
  #. [ read off grammar ]
  #. simplesmoothlexicon (add extra lexical productions)

- During parsing:

  #. replaceraretestwords (only give known words and signatures to parser)
  #. restore original words in derivations

"""
# pylint: disable=abstract-class-instantiated
import os
import re
import logging
import tempfile
from operator import itemgetter
from subprocess import Popen, PIPE
from collections import defaultdict, Counter, OrderedDict
from .tree import escape
from .util import which
from .containers import REMOVESTATESPLITS

# Base string of every unknown word signature; also used on its own as the
# catch-all signature for words whose signature was not seen in training.
UNK = '_UNK'
# A number: optionally some digits followed by a single separator
# (comma, period, or apostrophe), then one or more digits.
NUMBERRE = re.compile('^(?:[0-9]*[,.\'])?[0-9]+$')
# A four-digit number starting with 19 or 20.
YEARRE = re.compile('^(?:19|20)[0-9]{2}$')
# Message of the ValueError raised when the tree-tagger binary is not found.
TREETAGGERHELP = '''tree tagger not found. commands to install:
mkdir tree-tagger && cd tree-tagger
wget ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tree-tagger-linux-3.2.tar.gz
tar -xzf tree-tagger-linux-3.2.tar.gz
wget ftp://ftp.ims.uni-stuttgart.de/pub/corpora/tagger-scripts.tar.gz
tar -xzf tagger-scripts.tar.gz
mkdir lib && cd lib && wget \
ftp://ftp.ims.uni-stuttgart.de/pub/corpora/german-par-linux-3.2-utf8.bin.gz
gunzip german-par-linux-3.2-utf8.bin.gz'''
# Message of the ValueError raised when the Stanford tagger directory is
# not found.
STANFORDTAGGERHELP = '''Stanford tagger not found. Commands to install:
wget http://nlp.stanford.edu/software/stanford-postagger-full-2012-07-09.tgz
tar -xzf stanford-postagger-full-2012-07-09.tgz'''


def getunknownwordmodel(tagged_sents, unknownword,
        unknownthreshold, openclassthreshold):
    """Collect statistics for an unknown word model.

    :param tagged_sents: the sentences from the training set with the gold POS
        tags from the treebank; a sequence of sequences of ``(word, tag)``
        pairs. It is traversed more than once, so it must not be a one-shot
        iterator.
    :param unknownword: a function that returns a signature for a given word;
        e.g., "eschewed" => "_UNK-L-d". It is called as
        ``unknownword(word, position_in_sentence, lexicon)``.
    :param unknownthreshold: words with frequency lower than or equal to this
        are replaced by their signature.
    :param openclassthreshold: tags that rewrite to at least this much word
        types are considered to be open class categories, so that open class
        words and tags can be identified. When falsy, no open/closed class
        distinction is made.
    :returns: a tuple ``(lexmodel, msg)`` where ``lexmodel`` is the tuple
        ``(sigs, words, lexicon, closedclasswords, tags, wordtags)`` and
        ``msg`` is a multi-line, human-readable summary of the statistics.
    """
    wordsfortag = defaultdict(set)
    tags = Counter()
    wordtags = Counter()
    words = Counter(word for sent in tagged_sents for word, _ in sent)
    lexicon = {word for word, freq in words.items()
            if freq > unknownthreshold}
    for sent in tagged_sents:
        for word, tag in sent:
            wordsfortag[tag].add(word)
            tags[tag] += 1
            wordtags[word, tag] += 1

    # These are reported in the summary message below, so they must exist
    # even when the open class analysis is disabled.
    openclasstags = {}
    closedclasstags = {}
    closedclasswords = set()
    wordsforcollapsedtags = defaultdict(set)
    if openclassthreshold:
        # number of case-insensitive word types for each tag
        numtypes = {tag: len({w.lower() for w in ws})
                for tag, ws in wordsfortag.items()}
        # consider POS tags with different function tags together
        for tag, ws in wordsfortag.items():
            match = REMOVESTATESPLITS.match(tag)
            if match is not None:
                wordsforcollapsedtags[match.group(2)].update(
                        w.lower() for w in ws)
        for tag in wordsfortag:
            match = REMOVESTATESPLITS.match(tag)
            if match is None:
                # should not happen (tm), only for robustness
                isopen = numtypes[tag] >= openclassthreshold
            else:
                isopen = (len(wordsforcollapsedtags[match.group(2)])
                        >= openclassthreshold)
            if isopen:
                openclasstags[tag] = numtypes[tag]
        closedclasstags = {tag: numtypes[tag]
                for tag in tags if tag not in openclasstags}
        # FIXME: do we want to preserve case? could simplify code otherwise
        closedclasswords = {word for tag in closedclasstags
                for word in wordsfortag[tag]}
        # NB: words below unknown word threshold are included
        # add rare closed-class words back to lexicon
        lexicon.update(closedclasswords)

    sigs = Counter()
    for sent in tagged_sents:
        for n, (word, _) in enumerate(sent):
            if word not in lexicon:
                # NB: sig depends on n and lexicon
                sigs[unknownword(word, n, lexicon)] += 1

    msg = 'word types: %d, signature types: %d\n' % (
            len(lexicon), len(sigs))
    msg += 'number of words per collapsed tag: %s\n' % ' '.join(sorted(
            '%s:%d' % (a, len(b)) for a, b in wordsforcollapsedtags.items()))
    msg += 'open class tags: %s\n\n' % ' '.join(sorted(
            '%s:%d' % a for a in openclasstags.items()))
    msg += 'closed class tags: %s\n' % ' '.join(sorted(
            '%s:%d' % a for a in closedclasstags.items()))
    msg += 'closed class words: %s\n' % ' '.join(sorted(closedclasswords))
    return (sigs, words, lexicon, closedclasswords, tags, wordtags), msg
def replaceraretrainwords(tagged_sents, unknownword, lexicon):
    """Replace train set words not in lexicon w/signature from unknownword().

    Years are normalized to ``'1970'`` and other numbers to ``'000'``; these
    checks take precedence over the lexicon lookup.

    :param tagged_sents: sentences as sequences of ``(word, tag)`` pairs.
    :param unknownword: signature function, called as
        ``unknownword(word, position_in_sentence, lexicon)``.
    :param lexicon: container of words that are to be kept as is.
    :returns: a list with, for each sentence, a list of words (tags are
        dropped).
    """
    result = []
    for sent in tagged_sents:
        newsent = []
        for idx, (token, _) in enumerate(sent):
            if YEARRE.match(token):
                token = '1970'
            elif NUMBERRE.match(token):
                token = '000'
            elif token not in lexicon:
                token = unknownword(token, idx, lexicon)
            newsent.append(token)
        result.append(newsent)
    return result
def replaceraretestwords(sent, unknownword, lexicon, sigs):
    """Replace test set words not in lexicon w/signature from unknownword().

    This is a generator yielding one replacement per word of ``sent``.
    Years become ``'1970'`` and other numbers ``'000'``. If only a lowercase
    version of a word is in the grammar, that will be used instead. If the
    returned signature is not part of the grammar, a default one is
    returned.

    :param sent: a sequence of words.
    :param unknownword: signature function, called as
        ``unknownword(word, position_in_sentence, lexicon)``.
    :param lexicon: container of known words.
    :param sigs: container of signatures that are part of the grammar.
    """
    for idx, token in enumerate(sent):
        if YEARRE.match(token):
            replacement = '1970'
        elif NUMBERRE.match(token):
            replacement = '000'
        elif token in lexicon:
            replacement = token
        else:
            lowered = token.lower()
            if lowered in lexicon:
                replacement = lowered
            else:
                candidate = unknownword(token, idx, lexicon)
                # fall back on the catch-all signature
                replacement = candidate if candidate in sigs else UNK
        yield replacement
def simplesmoothlexicon(lexmodel, epsilon=1. / 100):
    """Collect new lexical productions.

    - for rare words, include productions with words in addition to
      signatures.
    - map unobserved signatures to ``_UNK`` and associate w/all potential
      tags.
    - (unobserved combinations of open class (word, tag) handled in parser).

    :param lexmodel: the 6-tuple ``(sigs, words, lexicon, closedclasswords,
        tags, wordtags)``; only ``lexicon``, ``tags`` and ``wordtags`` are
        used here.
    :param epsilon: pseudo-frequency of unseen productions ``tag => word``.
    :returns: a dictionary of lexical rules, with pseudo-frequencies as values.
    """
    _, _, lexicon, _, tags, wordtags = lexmodel
    # rare words as signature AND as word; needs to be normalized later
    newrules = {
            ((tag, 'Epsilon'), (escape(word), )): freq
            for (word, tag), freq in wordtags.items()
            if word not in lexicon}
    # catch-all unknown signature for every tag
    newrules.update(
            (((tag, 'Epsilon'), (UNK, )), epsilon) for tag in tags)
    return newrules
# === functions for unknown word signatures ============ HASDIGIT = re.compile(r"\d", re.UNICODE) HASNONDIGIT = re.compile(r"\D", re.UNICODE) # NB: includes '-', hyphen, non-breaking hyphen # does NOT include: figure-dash, em-dash, en-dash (these are punctuation, # not word-combining) u2012-u2015; nb: these are hex values. HASDASH = re.compile("[-\u2010\u2011]") # FIXME: exclude accented characters for model 6? HASLOWER = re.compile('[a-z\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb\xeb' '\xef\xfc\xff\u0153\xe6]') HASUPPER = re.compile('[A-Z\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4\xdb\xcb' '\xcf\xdc\u0178\u0152\xc6]') HASLETTER = re.compile('[A-Za-z\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb' '\xeb\xef\xfc\xff\u0153\xe6\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4' '\xdb\xcb\xcf\xdc\u0178\u0152\xc6]') # Cf. http://en.wikipedia.org/wiki/French_alphabet LOWER = ('abcdefghijklmnopqrstuvwxyz\xe7\xe9\xe0\xec\xf9\xe2\xea\xee\xf4\xfb' '\xeb\xef\xfc\xff\u0153\xe6') UPPER = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ\xc7\xc9\xc0\xcc\xd9\xc2\xca\xce\xd4\xdb' '\xcb\xcf\xdc\u0178\u0152\xc6') LOWERUPPER = LOWER + UPPER
def unknownword6(word, loc, lexicon):
    """Model 6 of the Stanford parser (for WSJ treebank).

    :param word: the word to produce a signature for.
    :param loc: index of the word in its sentence; a capitalized word at
        index 0 is marked as sentence-initial.
    :param lexicon: container of known words, consulted for the lower case
        version of a sentence-initial capitalized word.
    :returns: a signature string starting with ``_UNK``.
    """
    lowered = word.lower()
    numcaps = len(HASUPPER.findall(word))
    hasdigit = HASDIGIT.search(word) is not None
    hasdash = HASDASH.search(word) is not None
    parts = [UNK]

    # capitalization
    if numcaps > 1:
        parts.append("-CAPS")
    elif numcaps == 1:
        if loc != 0:
            parts.append("-CAP")
        else:
            parts.append("-INITC")
            if lowered in lexicon:
                parts.append("-KNOWNLC")
    elif HASLOWER.search(word):
        parts.append("-LC")

    if hasdigit:
        parts.append("-NUM")
    if hasdash:
        parts.append("-DASH")

    # suffixes
    if len(word) >= 3 and lowered.endswith('s'):
        # a final 's' counts unless preceded by 's', 'i', or 'u'
        if lowered[-2] not in 'siu':
            parts.append('-s')
    elif (len(word) >= 5 and not hasdash
            and not (hasdigit and numcaps > 0)):
        # only the first matching suffix is used
        suffix = next((cand for cand in (
                'ed', 'ing', 'ion', 'er', 'est', 'ly', 'ity', 'y', 'al')
                if lowered.endswith(cand)), None)
        if suffix is not None:
            parts.append("-%s" % suffix)
    return ''.join(parts)
def unknownword4(word, loc, _lexicon):
    """Model 4 of the Stanford parser. Relatively language agnostic.

    :param word: the word to produce a signature for.
    :param loc: index of the word in its sentence; distinguishes
        sentence-initial capitalized words.
    :param _lexicon: not used; present so that all signature functions can
        be called the same way.
    :returns: a signature string starting with ``_UNK``.
    """
    haslower = HASLOWER.search(word)
    # letters
    if word and word[0] in UPPER:
        if not haslower:
            case = "-AC"  # no lower case letters at all
        elif loc == 0:
            case = "-SC"  # capitalized at start of sentence
        else:
            case = "-C"
    elif haslower:
        case = "-L"
    elif HASLETTER.search(word):
        case = "-U"
    else:
        case = "-S"  # no letter
    parts = [UNK, case]
    # digits
    if HASDIGIT.search(word):
        parts.append("-n" if HASNONDIGIT.search(word) else "-N")
    # punctuation
    parts.extend(mark for char, mark in (
            ("-", "-H"), (".", "-P"), (",", "-C")) if char in word)
    # last two characters of longer words ending in a letter
    if len(word) > 3 and word[-1] in LOWERUPPER:
        parts.append("-%s" % word[-2:].lower())
    return ''.join(parts)
def unknownwordbase(word, _loc, _lexicon):
    """BaseUnknownWordModel of the Stanford parser.

    Relatively language agnostic. The signature records whether the first
    character is an upper case letter, whether the word contains digits
    (``-N`` when it contains nothing but digits, ``-n`` otherwise), whether
    it contains a ``-``, whether it is exactly ``.`` or ``,``, and, for words
    of more than 3 characters ending in a letter, its last two characters.

    :param word: the word to produce a signature for; an empty string is
        treated as not capitalized.
    :param _loc: not used; present so that all signature functions can be
        called the same way.
    :param _lexicon: not used; see ``_loc``.
    :returns: a signature string starting with ``_UNK``.
    """
    sig = UNK
    # letters; the ``word and`` guard makes an empty token count as
    # non-capitalized instead of raising IndexError on ``word[0]``.
    if word and word[0] in UPPER:
        sig += "-C"
    else:
        sig += "-c"
    # digits
    if HASDIGIT.search(word):
        if HASNONDIGIT.search(word):
            sig += "-n"
        else:
            sig += "-N"
    # punctuation
    if "-" in word:
        sig += "-H"
    if word == ".":
        sig += "-P"
    if word == ",":
        sig += "-C"
    if len(word) > 3 and word[-1] in LOWERUPPER:
        sig += "-%s" % word[-2:].lower()
    return sig
# Patterns for the French signature function. The suffix patterns are all
# anchored at the end of the word; the noun and adjective patterns allow an
# optional final 's'.
NOUNSUFFIX = re.compile("(ier|ière|ité|ion|ison|isme|ysme|iste|esse|eur|euse"
        "|ence|eau|erie|ng|ette|age|ade|ance|ude|ogue|aphe|ate|duc|anthe"
        "|archie|coque|érèse|ergie|ogie|lithe|mètre|métrie|odie|pathie|phie"
        "|phone|phore|onyme|thèque|scope|some|pole|ôme|chromie|pie)s?$")
ADJSUFFIX = re.compile("(iste|ième|uple|issime|aire|esque|atoire|ale|al|able"
        "|ible|atif|ique|if|ive|eux|aise|ent|ois|oise|ante|el|elle|ente|oire"
        "|ain|aine)s?$")
POSSIBLEPLURAL = re.compile("(s|ux)$")
VERBSUFFIX = re.compile("(ir|er|re|ez|ont|ent|ant|ais|ait|ra|era|eras"
        "|é|és|ées|isse|it)$")
ADVSUFFIX = re.compile("(iment|ement|emment|amment)$")
# One or more characters from a set of ASCII, Latin-1 and General
# Punctuation / currency ranges, anywhere in the string.
# NOTE(review): \u005C is a backslash, which inside the character class
# escapes the following ']' rather than adding a backslash to the class;
# confirm this is intended.
HASPUNC = re.compile("([\u0021-\u002F\u003A-\u0040\u005B\u005C\u005D"
        "\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E"
        "\u20A0-\u20B5])+")
# Same character class, anchored at the end of the string only; whether the
# whole string must consist of punctuation therefore depends on whether it
# is used with match() or search().
ISPUNC = re.compile("([\u0021-\u002F\u003A-\u0040\u005B\u005C\u005D"
        "\u005E-\u0060\u007B-\u007E\u00A1-\u00BF\u2010-\u2027\u2030-\u205E"
        "\u20A0-\u20B5])+$")
def unknownwordftb(word, loc, _lexicon):
    """Model 2 for French of the Stanford parser.

    The signature is built from, in this order: a word class guessed from
    the suffix (``-ADV``, ``-VB`` or ``-NN``; the first that applies), an
    adjective suffix (``-ADJ``), presence of a digit (``-NUM``), a possible
    plural ending (``-PL``), punctuation (``-ISPUNC`` when the whole word is
    punctuation, ``-HASPUNC`` when it merely contains some), and an upper
    case first letter in a word that is not sentence-initial (``-UP``).

    :param word: the word to produce a signature for.
    :param loc: index of the word in its sentence.
    :param _lexicon: not used; present so that all signature functions can
        be called the same way.
    :returns: a signature string starting with ``_UNK``.
    """
    sig = UNK
    if ADVSUFFIX.search(word):
        sig += "-ADV"
    elif VERBSUFFIX.search(word):
        sig += "-VB"
    elif NOUNSUFFIX.search(word):
        sig += "-NN"
    if ADJSUFFIX.search(word):
        # this used to append "-ADV", which made adjectives
        # indistinguishable from adverbs
        sig += "-ADJ"
    if HASDIGIT.search(word):
        sig += "-NUM"
    if POSSIBLEPLURAL.search(word):
        sig += "-PL"
    # ISPUNC is only anchored at the end, so match() (anchored at the start)
    # is needed to test that the word consists entirely of punctuation;
    # search() would also accept any word that just ends in punctuation.
    if ISPUNC.match(word):
        sig += "-ISPUNC"
    elif HASPUNC.search(word):
        sig += "-HASPUNC"
    if loc > 0 and word and word[0] in UPPER:
        sig += "-UP"
    return sig
# Maps the name of an unknown word model to the function implementing it;
# each function takes (word, position in sentence, lexicon) and returns a
# signature string.
UNKNOWNWORDFUNC = {
        "4": unknownword4,
        "6": unknownword6,
        "base": unknownwordbase,
        "ftb": unknownwordftb,
        }

# === Performing POS tagging with external tools ============
def externaltagging(usetagger, model, sents, overridetag, tagmap):
    """Use an external tool to tag a list of sentences.

    :param usetagger: the tagger to run: ``'treetagger'``, ``'stanford'``,
        or ``'frog'``.
    :param model: filename of the tagger model; when empty, a default German
        model is used for treetagger and stanford. Not used for frog.
    :param sents: a dictionary mapping sentence identifiers to sentences,
        each a list of ``(word, goldtag)`` pairs.
    :param overridetag: passed on to ``tagmangle()`` (treetagger and stanford
        only): maps a tag to a collection of words that always get that tag.
    :param tagmap: passed on to ``tagmangle()`` (treetagger and stanford
        only): maps tags produced by the tagger to replacement tags.
    :returns: an OrderedDict mapping each sentence identifier to a list of
        ``(word, tag)`` pairs as produced by the tagger.
    :raises ValueError: if ``usetagger`` is not recognized, if the requested
        tagger is not installed where it is expected (relative to the
        current directory), or if the tagger output does not line up with
        the input in number of sentences or tokens.
    """
    if usetagger not in ('treetagger', 'stanford', 'frog'):
        raise ValueError('unknown tagger: %r; expected one of '
                'treetagger, stanford, frog' % (usetagger, ))
    logging.info('Start tagging.')
    goldtags = [t for sent in sents.values() for _, t in sent]
    if usetagger == 'treetagger':  # Tree-tagger
        if not os.path.exists('tree-tagger/bin/tree-tagger'):
            raise ValueError(TREETAGGERHELP)
        filtertags = ''
        if not model:
            model = 'tree-tagger/lib/german-par-linux-3.2-utf8.bin'
            filtertags = '| tree-tagger/cmd/filter-german-tags'
        # one token per line, sentences separated by an <S> marker
        inname = _writetaggerinput(sents, '\n', '\n<S>\n')
        try:
            # NB: this goes through the shell because of the optional
            # pipe to the tag filter; model is interpolated unquoted.
            tagger = Popen('tree-tagger/bin/tree-tagger -token -sgml'
                    ' %s %s %s' % (model, inname, filtertags),
                    stdout=PIPE, shell=True)
            # communicate() reads all output and waits for the process
            tagout = tagger.communicate()[0].decode(
                    'utf-8').split('<S>')[:-1]
        finally:
            os.unlink(inname)
        taggedsents = OrderedDict(
                (n, [tagmangle(a, None, overridetag, tagmap)
                    for a in tags.splitlines() if a.strip()])
                for n, tags in zip(sents, tagout))
    elif usetagger == 'stanford':  # Stanford Tagger
        if not os.path.exists('stanford-postagger-full-2012-07-09'):
            raise ValueError(STANFORDTAGGERHELP)
        if not model:
            model = 'models/german-hgc.tagger'
        # one sentence per line, tokens separated by spaces
        inname = _writetaggerinput(sents, ' ', '\n')
        try:
            # an explicit argument list, so that filenames containing
            # whitespace are passed on intact
            tagger = Popen(args=[
                    '/usr/bin/java', '-mx2G',
                    '-classpath', 'stanford-postagger.jar',
                    'edu.stanford.nlp.tagger.maxent.MaxentTagger',
                    '-tokenize', 'false', '-encoding', 'utf-8',
                    '-model', model, '-textFile', inname],
                    cwd='stanford-postagger-full-2012-07-09',
                    shell=False, stdout=PIPE)
            tagout = tagger.communicate()[0].decode('utf-8').splitlines()
        finally:
            os.unlink(inname)
        taggedsents = OrderedDict(
                (n, [tagmangle(a, '_', overridetag, tagmap)
                    for a in tags.split()])
                for n, tags in zip(sents, tagout))
    else:  # Dutch 'frog' tagger
        # stderr is captured so that it can be written to the log
        tagger = Popen(args=[which('frog')]
                + '-n --skip=tacmnp -t /dev/stdin'.split(),
                shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        tagout, stderr = tagger.communicate(''.join(
                ' '.join(word for word, _ in tagsent) + '\n'
                for tagsent in sents.values()).encode('utf8'))
        logging.info('%s', stderr.decode('utf-8', 'replace'))
        # lines consist of: 'idx token lemma POS score'
        taggedsents = OrderedDict()
        for n, lines in zip(sents, tagout.decode('utf-8').split('\n\n')):
            fields = [line.split() for line in lines.splitlines()
                    if line.strip()]
            taggedsents[n] = [
                    (a[1], a[3].replace('(', '[').replace(')', ']'))
                    for a in fields]
    if len(taggedsents) != len(sents):
        raise ValueError('mismatch in number of sentences after tagging.')
    for n, tags in taggedsents.items():
        if len(sents[n]) != len(tags):
            raise ValueError('mismatch in number of tokens after tagging.\n'
                'before: %r\nafter: %r' % (sents[n], tags))
    newtags = [t for sent in taggedsents.values() for _, t in sent]
    if goldtags:  # accuracy is undefined without any tokens
        logging.info(
                'Tag accuracy: %5.2f\ngold - cand: %r\ncand - gold %r',
                (100 * accuracy(goldtags, newtags)),
                set(goldtags) - set(newtags), set(newtags) - set(goldtags))
    return taggedsents


def _writetaggerinput(sents, wordsep, sentsep):
    """Write the words of sents to a new UTF-8 temporary file.

    Words of a sentence are joined with ``wordsep`` and each sentence is
    followed by ``sentsep``. The caller is responsible for removing the file.

    :returns: the filename of the temporary file.
    """
    handle, inname = tempfile.mkstemp(text=True)
    try:
        # write text and let the file object do the encoding; joining
        # encoded bytes with a str separator fails on Python 3.
        with os.fdopen(handle, 'w', encoding='utf-8') as out:
            for tagsent in sents.values():
                out.write(
                        wordsep.join(word for word, _ in tagsent) + sentsep)
    except Exception:
        os.unlink(inname)  # do not leave a partial file behind
        raise
    return inname
def accuracy(gold, cand):
    """Compute fraction of equivalent pairs in two sequences.

    Items are compared pairwise in order; if the sequences differ in length,
    the surplus items of the longer one are not compared, but the result is
    still divided by the length of ``gold``.

    :param gold: the sequence of reference items.
    :param cand: the sequence of items to evaluate.
    :returns: a float between 0 and 1; 0.0 when ``gold`` is empty.
    """
    if not gold:
        return 0.0  # avoid ZeroDivisionError on empty input
    # float() ensures true division regardless of Python version
    return sum(a == b for a, b in zip(gold, cand)) / float(len(gold))
def tagmangle(a, splitchar, overridetag, tagmap):
    """Function to filter tags after they are produced by the tagger.

    :param a: a string with a word and its tag, separated by ``splitchar``.
    :param splitchar: the separator; the string is split on its last
        occurrence. ``None`` means splitting on whitespace.
    :param overridetag: maps a tag to a collection of words; a word found in
        such a collection gets that tag instead of the one from the tagger.
        If a word is listed under several tags, the last one in iteration
        order is used.
    :param tagmap: maps tags to replacement tags; applied last. Tags that
        are not in this mapping are left as is.
    :returns: a tuple ``(word, tag)``.
    """
    token, label = a.rsplit(splitchar, 1)
    forced = [candidate for candidate in overridetag
            if token in overridetag[candidate]]
    if forced:
        label = forced[-1]
    return token, tagmap.get(label, label)
# The following is based on the undo-compounds transformation of FTB in
# Candito, M., Crabbé, B., & Denis, P. (2010). Statistical French dependency
# parsing: treebank conversion and first results.
# http://www.lrec-conf.org/proceedings/lrec2010/pdf/392_Paper.pdf

# regex expressions for repeated compound patterns which will be undone
# Keys are category labels; presumably each pattern is matched against the
# space-separated sequence of POS tags of a compound's parts -- confirm
# against the code using this table.
FTBREGULARCOMPOUNDPATTERNS = {
        # an N, maybe with Det, Adj, and PPs:
        # "Organisation de coopération et de développement économique"
        # "Institut de formation des agents de voyages"
        # "pomme de terre"
        # "marché monétaire et obligataire"
        # "Bureau de recherches géologiques et minières"
        'N': re.compile(
            r'(D )?(A )*N( A( C A)?)*( P(\+D)?( D)?( A)* N( A( C A)?)*'
            r'(( C)? P(\+D)?( D)?( A)* N( A)*)*)?$'
            # r'|N( N)+$'  # Air France, maison mère ...
            # r'|(N|ET)( ET)+$'  # Wall Street, Zenith Data Structures,
            ),
        'V': re.compile(
            r'V( V)*( P| A| D)*( N)+( P)*$'  # mettre en place, etc.
            r'|V( D)?( A)?( N)+$'),  # faire face, faire appel
        'P': re.compile(r'P(\+D)? (D )?(A )*N( P(\+D)?( D)?( A)* N( A( C A)?)*'
            r'(( C)? P(\+D)?( D)?( A)* N( A)*)*)? P(\+D)?$'),
        'ADV': re.compile(r'P(\+D)? (D )?(A )*N( P(\+D)?( D)?( A)* N( A)*)?$'),
        # NB: unlike the other patterns, this one has no '$' anchor.
        'A': re.compile(r'A (C A)?'),
        }

# The following compounds should not be undone; mostly organization names or
# fixed expressions like "aujourd'hui". TODO! extend list of allowed compounds
# Each entry is a complete compound in bracketed tree notation.
FTBALLOWEDCOMPOUNDS = {
        '(MWN (N Fondation) (N France) (A active))',
        '(MWN (N Côte) (PONCT -) (P d\') (N Ivoire))',
        '(MWP (CL Il) (CL y) (V a))',
        '(MWN (N Jean) (PONCT -) (N Louis))',
        '(MWADV (ADV tout) (P de) (N suite))',
        '(MWP (P jusqu\') (P au))',
        '(MWN (A Haute) (PONCT -) (N Corse))',
        '(MWP (CL y) (A compris))',
        '(MWN (N Chalon) (PONCT -) (P sur) (PONCT -) (N Saône))',
        '(MWP (CL il) (CL y) (V a))',
        '(MWN (N Royaume) (PONCT -) (A uni))',
        '(MWN (N Seine) (PONCT -) (N Saint) (PONCT -) (N Denis))',
        '(MWN (N Roche) (PONCT -) (N la) (PONCT -) (N Molière))',
        '(MWN (N Air) (N France))',
        '(MWN (A Grande) (PONCT -) (N Bretagne))',
        '(MWN (N Union) (A soviétique))',
        }

# The public API of this module.
__all__ = ['getunknownwordmodel', 'replaceraretrainwords',
        'replaceraretestwords', 'simplesmoothlexicon', 'unknownword6',
        'unknownword4', 'unknownwordbase', 'unknownwordftb',
        'externaltagging', 'tagmangle']